Statistics
| Branch: | Revision:

ffmpeg / libavcodec / vp8.c @ 7634771e

History | View | Annotate | Download (65.7 KB)

1
/**
2
 * VP8 compatible video decoder
3
 *
4
 * Copyright (C) 2010 David Conrad
5
 * Copyright (C) 2010 Ronald S. Bultje
6
 * Copyright (C) 2010 Jason Garrett-Glaser
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
#include "libavutil/imgutils.h"
26
#include "avcodec.h"
27
#include "vp56.h"
28
#include "vp8data.h"
29
#include "vp8dsp.h"
30
#include "h264pred.h"
31
#include "rectangle.h"
32

    
33
#if ARCH_ARM
34
#   include "arm/vp8.h"
35
#endif
36

    
37
/**
 * Loop-filter strength parameters computed per macroblock.
 * (Consumed by the loop filter; the filter code itself is outside this chunk.)
 */
typedef struct {
    uint8_t filter_level;
    uint8_t inner_limit;
    uint8_t inner_filter;   ///< whether inner (sub-MB) edges are filtered — TODO confirm against filter code
} VP8FilterStrength;
42

    
43
/**
 * Per-macroblock decode state: mode, reference frame and motion vectors
 * (one MV per subblock when the MB uses split-MV prediction).
 */
typedef struct {
    uint8_t skip;           ///< read from prob->mbskip when mbskip_enabled
    // todo: make it possible to check for at least (i4x4 or split_mv)
    // in one op. are others needed?
    uint8_t mode;           ///< intra pred mode or VP8_MVMODE_* for inter MBs
    uint8_t ref_frame;      ///< VP56_FRAME_* this MB predicts from
    uint8_t partitioning;   ///< VP8_SPLITMVMODE_* sub-MV layout
    VP56mv mv;              ///< macroblock-level motion vector
    VP56mv bmv[16];         ///< per-subblock motion vectors
} VP8Macroblock;
53

    
54
/**
 * Decoder state for one VP8 stream.
 */
typedef struct {
    AVCodecContext *avctx;
    DSPContext dsp;
    VP8DSPContext vp8dsp;
    H264PredContext hpc;
    vp8_mc_func put_pixels_tab[3][3][3];
    AVFrame frames[4];      ///< frame pool; released in vp8_decode_flush()
    AVFrame *framep[4];     ///< indexed by VP56_FRAME_* reference type
    uint8_t *edge_emu_buffer;
    VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
    int profile;

    int mb_width;   /* number of horizontal MB */
    int mb_height;  /* number of vertical MB */
    int linesize;
    int uvlinesize;

    int keyframe;
    int invisible;      ///< frame is decoded but not displayed (show_frame bit clear)
    int update_last;    ///< update VP56_FRAME_PREVIOUS with the current one
    int update_golden;  ///< VP56_FRAME_NONE if not updated, or which frame to copy if so
    int update_altref;  ///< same semantics as update_golden, for the altref buffer
    int deblock_filter;

    /**
     * If this flag is not set, all the probability updates
     * are discarded after this frame is decoded.
     */
    int update_probabilities;

    /**
     * All coefficients are contained in separate arith coding contexts.
     * There can be 1, 2, 4, or 8 of these after the header context.
     */
    int num_coeff_partitions;
    VP56RangeCoder coeff_partition[8];

    VP8Macroblock *macroblocks;         ///< points one element past macroblocks_base
    VP8Macroblock *macroblocks_base;
    VP8FilterStrength *filter_strength; ///< one entry per MB column

    uint8_t *intra4x4_pred_mode_top;    ///< 4 entries per MB column
    uint8_t intra4x4_pred_mode_left[4];
    uint8_t *segmentation_map;          ///< one segment id per MB; persists between frames

    /**
     * Cache of the top row needed for intra prediction
     * 16 for luma, 8 for each chroma plane
     */
    uint8_t (*top_border)[16+8+8];

    /**
     * For coeff decode, we need to know whether the above block had non-zero
     * coefficients. This means for each macroblock, we need data for 4 luma
     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
     * per macroblock. We keep the last row in top_nnz.
     */
    uint8_t (*top_nnz)[9];
    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];

    /**
     * This is the index plus one of the last non-zero coeff
     * for each of the blocks in the current macroblock.
     * So, 0 -> no coeffs
     *     1 -> dc-only (special transform)
     *     2+-> full transform
     */
    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
    uint8_t intra4x4_pred_mode_mb[16];

    int chroma_pred_mode;    ///< 8x8c pred mode of the current macroblock
    int segment;             ///< segment of the current macroblock
    VP56mv mv_min;           ///< lower clamp bound for MVs, set per MB position
    VP56mv mv_max;           ///< upper clamp bound for MVs

    int mbskip_enabled;
    int sign_bias[4]; ///< one state [0, 1] per ref frame type
    int ref_count[3];

    /**
     * Base parameters for segmentation, i.e. per-macroblock parameters.
     * These must be kept unchanged even if segmentation is not used for
     * a frame, since the values persist between interframes.
     */
    struct {
        int enabled;
        int absolute_vals;      ///< base_quant/filter_level are absolute, not deltas
        int update_map;
        int8_t base_quant[4];
        int8_t filter_level[4];     ///< base loop filter level
    } segmentation;

    /**
     * Macroblocks can have one of 4 different quants in a frame when
     * segmentation is enabled.
     * If segmentation is disabled, only the first segment's values are used.
     */
    struct {
        // [0] - DC qmul  [1] - AC qmul
        int16_t luma_qmul[2];
        int16_t luma_dc_qmul[2];    ///< luma dc-only block quant
        int16_t chroma_qmul[2];
    } qmat[4];

    struct {
        int simple;
        int level;
        int sharpness;
    } filter;

    struct {
        int enabled;    ///< whether each mb can have a different strength based on mode/ref

        /**
         * filter strength adjustment for the following macroblock modes:
         * [0-3] - i16x16 (always zero)
         * [4]   - i4x4
         * [5]   - zero mv
         * [6]   - inter modes except for zero or split mv
         * [7]   - split mv
         *  i16x16 modes never have any adjustment
         */
        int8_t mode[VP8_MVMODE_SPLIT+1];

        /**
         * filter strength adjustment for macroblocks that reference:
         * [0] - intra / VP56_FRAME_CURRENT
         * [1] - VP56_FRAME_PREVIOUS
         * [2] - VP56_FRAME_GOLDEN
         * [3] - altref / VP56_FRAME_GOLDEN2
         */
        int8_t ref[4];
    } lf_delta;

    /**
     * These are all of the updatable probabilities for binary decisions.
     * They are only implicitly reset on keyframes, making it quite likely
     * for an interframe to desync if a prior frame's header was corrupt
     * or missing outright!
     * prob[0] is the active set; prob[1] holds a backup copy when this
     * frame's updates are not to be kept (see update_probabilities).
     */
    struct {
        uint8_t segmentid[3];
        uint8_t mbskip;
        uint8_t intra;
        uint8_t last;
        uint8_t golden;
        uint8_t pred16x16[4];
        uint8_t pred8x8c[3];
        /* Padded to allow overreads */
        uint8_t token[4][17][3][NUM_DCT_TOKENS-1];
        uint8_t mvc[2][19];
    } prob[2];
} VP8Context;
209

    
210
/**
 * Release every held reference frame and free all per-dimension tables.
 * Also invoked from update_dimensions() before tables are reallocated.
 */
static void vp8_decode_flush(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int frame_idx;

    /* Hand every allocated frame buffer back to the codec context. */
    for (frame_idx = 0; frame_idx < 4; frame_idx++) {
        if (s->frames[frame_idx].data[0])
            avctx->release_buffer(avctx, &s->frames[frame_idx]);
    }
    memset(s->framep, 0, sizeof(s->framep));

    /* Per-dimension tables; all reallocated by update_dimensions(). */
    av_freep(&s->macroblocks_base);
    av_freep(&s->filter_strength);
    av_freep(&s->intra4x4_pred_mode_top);
    av_freep(&s->top_nnz);
    av_freep(&s->edge_emu_buffer);
    av_freep(&s->top_border);
    av_freep(&s->segmentation_map);

    s->macroblocks = NULL;  /* derived from macroblocks_base, now dangling */
}
230

    
231
/**
 * Validate the new frame size, drop all old state, and (re)allocate the
 * per-dimension tables.
 *
 * @return 0 on success, AVERROR_INVALIDDATA on a bad size,
 *         AVERROR(ENOMEM) if any allocation fails
 */
static int update_dimensions(VP8Context *s, int width, int height)
{
    AVCodecContext *avctx = s->avctx;

    if (av_image_check_size(width, height, 0, avctx))
        return AVERROR_INVALIDDATA;

    /* Old references and tables are sized for the old dimensions. */
    vp8_decode_flush(avctx);

    avcodec_set_dimensions(avctx, width, height);

    s->mb_width  = (avctx->coded_width  + 15) / 16;
    s->mb_height = (avctx->coded_height + 15) / 16;

    /* One extra MB row/margin in macroblocks_base gives valid top/left
     * neighbours for edge macroblocks (accessed via s->macroblocks + 1). */
    s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
                                           sizeof(*s->macroblocks));
    s->filter_strength        = av_mallocz(s->mb_width * sizeof(*s->filter_strength));
    s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
    s->top_nnz                = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
    s->top_border             = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
    s->segmentation_map       = av_mallocz(s->mb_width * s->mb_height);

    if (!s->macroblocks_base || !s->filter_strength ||
        !s->intra4x4_pred_mode_top || !s->top_nnz ||
        !s->top_border || !s->segmentation_map)
        return AVERROR(ENOMEM);

    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}
258

    
259
/**
 * Parse segmentation header data (spec 9.3): optional per-segment quant and
 * filter-level values, plus the segment-id tree probabilities when the
 * segmentation map is being updated.
 */
static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int seg;

    s->segmentation.update_map = vp8_rac_get(c);

    if (vp8_rac_get(c)) { /* segment feature data present */
        s->segmentation.absolute_vals = vp8_rac_get(c);

        /* Bitstream order: four quantizer values, then four filter levels. */
        for (seg = 0; seg < 4; seg++)
            s->segmentation.base_quant[seg]   = vp8_rac_get_sint(c, 7);
        for (seg = 0; seg < 4; seg++)
            s->segmentation.filter_level[seg] = vp8_rac_get_sint(c, 6);
    }

    if (s->segmentation.update_map) {
        /* Absent probabilities default to 255. */
        for (seg = 0; seg < 3; seg++)
            s->prob->segmentid[seg] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8)
                                                     : 255;
    }
}
279

    
280
/**
 * Read loop-filter delta adjustments: one signed delta per reference-frame
 * class, then one per prediction-mode class (bitstream order, spec 9.4).
 */
static void update_lf_deltas(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int n;

    for (n = 0; n < 4; n++)
        s->lf_delta.ref[n] = vp8_rac_get_sint(c, 6);

    for (n = MODE_I4x4; n <= VP8_MVMODE_SPLIT; n++)
        s->lf_delta.mode[n] = vp8_rac_get_sint(c, 6);
}
291

    
292
/**
 * Initialize the coefficient-partition range decoders (spec 9.5).
 * The sizes of all partitions except the last are stored as 24-bit LE
 * values at the start of the data; the last partition takes the remainder.
 *
 * @return 0 on success, -1 if the partition sizes exceed the buffer
 */
static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *size_table = buf;
    int part, last;

    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
    last = s->num_coeff_partitions - 1;

    /* Skip over the size table (3 bytes per explicit size). */
    buf      += 3 * last;
    buf_size -= 3 * last;
    if (buf_size < 0)
        return -1;

    for (part = 0; part < last; part++) {
        int part_size = AV_RL24(size_table + 3 * part);
        if (part_size > buf_size)
            return -1;

        ff_vp56_init_range_decoder(&s->coeff_partition[part], buf, part_size);
        buf      += part_size;
        buf_size -= part_size;
    }
    /* Last partition: everything that remains. */
    ff_vp56_init_range_decoder(&s->coeff_partition[last], buf, buf_size);

    return 0;
}
317

    
318
/**
 * Read the frame quantizer indices (spec 9.6) and fill the per-segment
 * dequantization tables. The bitstream carries one base AC index plus
 * signed deltas for each plane/coefficient class.
 */
static void get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int seg;

    /* Frame-level base index and the five deltas, in bitstream order. */
    int yac_qi     = vp8_rac_get_uint(c, 7);
    int ydc_delta  = vp8_rac_get_sint(c, 4);
    int y2dc_delta = vp8_rac_get_sint(c, 4);
    int y2ac_delta = vp8_rac_get_sint(c, 4);
    int uvdc_delta = vp8_rac_get_sint(c, 4);
    int uvac_delta = vp8_rac_get_sint(c, 4);

    for (seg = 0; seg < 4; seg++) {
        int qi = yac_qi;

        if (s->segmentation.enabled) {
            qi = s->segmentation.base_quant[seg];
            if (!s->segmentation.absolute_vals)
                qi += yac_qi;   /* segment value is a delta on the frame base */
        }

        s->qmat[seg].luma_qmul[0]    =       vp8_dc_qlookup[av_clip(qi + ydc_delta,  0, 127)];
        s->qmat[seg].luma_qmul[1]    =       vp8_ac_qlookup[av_clip(qi,              0, 127)];
        s->qmat[seg].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip(qi + y2dc_delta, 0, 127)];
        s->qmat[seg].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip(qi + y2ac_delta, 0, 127)] / 100;
        s->qmat[seg].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip(qi + uvdc_delta, 0, 127)];
        s->qmat[seg].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip(qi + uvac_delta, 0, 127)];

        /* Spec-mandated clamps on the scaled values. */
        s->qmat[seg].luma_dc_qmul[1] = FFMAX(s->qmat[seg].luma_dc_qmul[1], 8);
        s->qmat[seg].chroma_qmul[0]  = FFMIN(s->qmat[seg].chroma_qmul[0], 132);
    }
}
349

    
350
/**
351
 * Determine which buffers golden and altref should be updated with after this frame.
352
 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
353
 *
354
 * Intra frames update all 3 references
355
 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
356
 * If the update (golden|altref) flag is set, it's updated with the current frame
357
 *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
358
 * If the flag is not set, the number read means:
359
 *      0: no update
360
 *      1: VP56_FRAME_PREVIOUS
361
 *      2: update golden with altref, or update altref with golden
362
 */
363
static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
364
{
365
    VP56RangeCoder *c = &s->c;
366

    
367
    if (update)
368
        return VP56_FRAME_CURRENT;
369

    
370
    switch (vp8_rac_get_uint(c, 2)) {
371
    case 1:
372
        return VP56_FRAME_PREVIOUS;
373
    case 2:
374
        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
375
    }
376
    return VP56_FRAME_NONE;
377
}
378

    
379
/**
 * Read the golden/altref update flags for an inter frame and resolve which
 * buffer each will be refreshed from (see ref_to_update()).
 * Note: both flags must be read before either ref_to_update() call, since
 * the latter may itself consume bits from the range coder.
 */
static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int golden_flag, altref_flag;

    golden_flag = vp8_rac_get(c);
    altref_flag = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, golden_flag, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, altref_flag, VP56_FRAME_GOLDEN2);
}
389

    
390
/**
 * Parse the uncompressed data chunk and the compressed frame header
 * (spec section 9): frame type, dimensions, segmentation, loop filter,
 * partitions, quantizers, reference updates, and probability updates.
 *
 * @param buf      frame data, starting at the 3-byte frame tag
 * @param buf_size number of bytes available at buf
 * @return 0 on success, a negative AVERROR on invalid data
 */
static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, i, j, k, l, m, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

    /* 3-byte frame tag: keyframe flag, profile, show_frame, header size */
    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));

    if (header_size > buf_size - 7*s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        /* Keyframes carry a start code plus dimensions before the header. */
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        width  = AV_RL16(buf+3) & 0x3fff;
        height = AV_RL16(buf+5) & 0x3fff;
        hscale = buf[4] >> 6;
        vscale = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            av_log_missing_feature(s->avctx, "Upscaling", 1);

        /* Keyframes implicitly reset all references and probabilities. */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        for (i = 0; i < 4; i++)
            for (j = 0; j < 16; j++)
                memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                       sizeof(s->prob->token[i][j]));
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height) {
        /* Fixed operator precedence: the original
         * `(ret = update_dimensions(...) < 0)` assigned the *comparison*
         * result (0/1) to ret, so failures returned 1 instead of the
         * negative error code. */
        if ((ret = update_dimensions(s, width, height)) < 0)
            return ret;
    }

    ff_vp56_init_range_decoder(c, buf, header_size);
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        if (vp8_rac_get(c))
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        vp8_rac_get(c); // whether we can skip clamping in dsp functions
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c)))
        if (vp8_rac_get(c))
            update_lf_deltas(s);

    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    s->update_last = s->keyframe || vp8_rac_get(c);

    /* Token (DCT coefficient) probability updates, spec 13.4. */
    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        /* one update fans out to every band sharing index j */
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);

        if (vp8_rac_get(c))
            for (i = 0; i < 4; i++)
                s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
        if (vp8_rac_get(c))
            for (i = 0; i < 3; i++)
                s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);

        // 17.2 MV probability update
        for (i = 0; i < 2; i++)
            for (j = 0; j < 19; j++)
                if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                    s->prob->mvc[i][j] = vp8_rac_get_nn(c);
    }

    return 0;
}
526

    
527
/**
 * Copy *src into *dst, clamping each component into the currently allowed
 * MV range (s->mv_min / s->mv_max).
 */
static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
{
    int mx = src->x;
    int my = src->y;

    dst->x = av_clip(mx, s->mv_min.x, s->mv_max.x);
    dst->y = av_clip(my, s->mv_min.y, s->mv_max.y);
}
532

    
533
/**
 * Motion vector coding, 17.1.
 * Decode one signed MV component from the range coder:
 * either a "large" 10-bit magnitude coded bit-by-bit, or a small value
 * (0-7) coded through a tree; a final bit gives the sign.
 */
static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        /* large MV: low 3 bits then bits 9..4, each with its own prob */
        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = 9; i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        /* bit 3 is implicit when the upper bits are all zero */
        if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p+2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3*bit;
        x  += 4*bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2*bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    /* sign bit is only coded for non-zero magnitudes */
    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}
563

    
564
static av_always_inline
565
const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
566
{
567
    if (left == top)
568
        return vp8_submv_prob[4-!!left];
569
    if (!top)
570
        return vp8_submv_prob[2];
571
    return vp8_submv_prob[1-!!left];
572
}
573

    
574
/**
575
 * Split motion vector prediction, 16.4.
576
 * @returns the number of motion vectors parsed (2, 4 or 16)
577
 */
578
static av_always_inline
579
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
580
{
581
    int part_idx;
582
    int n, num;
583
    VP8Macroblock *top_mb  = &mb[2];
584
    VP8Macroblock *left_mb = &mb[-1];
585
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
586
                  *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
587
                  *mbsplits_cur, *firstidx;
588
    VP56mv *top_mv  = top_mb->bmv;
589
    VP56mv *left_mv = left_mb->bmv;
590
    VP56mv *cur_mv  = mb->bmv;
591

    
592
    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
593
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
594
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
595
        } else {
596
            part_idx = VP8_SPLITMVMODE_8x8;
597
        }
598
    } else {
599
        part_idx = VP8_SPLITMVMODE_4x4;
600
    }
601

    
602
    num = vp8_mbsplit_count[part_idx];
603
    mbsplits_cur = vp8_mbsplits[part_idx],
604
    firstidx = vp8_mbfirstidx[part_idx];
605
    mb->partitioning = part_idx;
606

    
607
    for (n = 0; n < num; n++) {
608
        int k = firstidx[n];
609
        uint32_t left, above;
610
        const uint8_t *submv_prob;
611

    
612
        if (!(k & 3))
613
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
614
        else
615
            left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
616
        if (k <= 3)
617
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
618
        else
619
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
620

    
621
        submv_prob = get_submv_prob(left, above);
622

    
623
        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
624
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
625
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
626
                    mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
627
                    mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
628
                } else {
629
                    AV_ZERO32(&mb->bmv[n]);
630
                }
631
            } else {
632
                AV_WN32A(&mb->bmv[n], above);
633
            }
634
        } else {
635
            AV_WN32A(&mb->bmv[n], left);
636
        }
637
    }
638

    
639
    return num;
640
}
641

    
642
/**
 * Inter-MB motion vector decoding (spec 16.2/16.3): build the nearest/near
 * MV candidates from the top, left and top-left neighbours, then read the
 * MV mode (zero / nearest / near / new / split) and the resulting MV(s).
 */
static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
{
    /* Neighbour MBs reached via the +2/-1/+1 offsets into the ring of
     * macroblocks around mb (layout set up by the caller's allocation). */
    VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
                                  mb - 1 /* left */,
                                  mb + 1 /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { EDGE_TOP, EDGE_LEFT, EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left: accumulate distinct non-zero
     * neighbour MVs into near_mv[] with weighted counts in cnt[]
     * (top/left weigh 2, top-left weighs 1). */
    #define MV_EDGE_CHECK(n)\
    {\
        VP8Macroblock *edge = mb_edge[n];\
        int edge_ref = edge->ref_frame;\
        if (edge_ref != VP56_FRAME_CURRENT) {\
            uint32_t mv = AV_RN32A(&edge->mv);\
            if (mv) {\
                if (cur_sign_bias != sign_bias[edge_ref]) {\
                    /* SWAR negate of the values in mv. */\
                    mv = ~mv;\
                    mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
                }\
                if (!n || mv != AV_RN32A(&near_mv[idx]))\
                    AV_WN32A(&near_mv[++idx], mv);\
                cnt[idx]      += 1 + (n != 2);\
            } else\
                cnt[CNT_ZERO] += 1 + (n != 2);\
        }\
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1+EDGE_TOP]) == AV_RN32A(&near_mv[1+EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* Repurpose cnt[CNT_SPLITMV] as the split-MV mode context:
                 * how many neighbours are themselves split-MV coded. */
                cnt[CNT_SPLITMV] = ((mb_edge[EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* MB-level MV is defined as the last subblock's MV */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
                } else {
                    /* NEWMV: explicit delta on the (clamped) best MV */
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
731

    
732
static av_always_inline
733
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
734
                           int mb_x, int keyframe)
735
{
736
    uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
737
    if (keyframe) {
738
        int x, y;
739
        uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
740
        uint8_t* const left = s->intra4x4_pred_mode_left;
741
        for (y = 0; y < 4; y++) {
742
            for (x = 0; x < 4; x++) {
743
                const uint8_t *ctx;
744
                ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
745
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
746
                left[y] = top[x] = *intra4x4;
747
                intra4x4++;
748
            }
749
        }
750
    } else {
751
        int i;
752
        for (i = 0; i < 16; i++)
753
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
754
    }
755
}
756

    
757
/**
 * Decode the per-macroblock header (spec 16.1/16.2): segment id, skip flag,
 * prediction mode, reference frame, and — for inter MBs — motion vectors.
 *
 * @param segment in/out: the MB's segment id from (and written back to)
 *                the persistent segmentation map
 */
static av_always_inline
void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment)
{
    VP56RangeCoder *c = &s->c;

    /* segment id is only coded when the map is updated; otherwise the
     * value persisting from the previous frame is reused */
    if (s->segmentation.update_map)
        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
    s->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb_x, 1);
        } else {
            /* i16x16: replicate the implied 4x4 mode into the top/left
             * context rows for neighbouring i4x4 MBs */
            const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
            AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
        mb->ref_frame = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
                VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame-1]++;

        // motion vectors, 16.3
        decode_mvs(s, mb, mb_x, mb_y);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb_x, 0);

        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
        mb->ref_frame = VP56_FRAME_CURRENT;
        mb->partitioning = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}
805

    
806
#ifndef decode_block_coeffs_internal
/**
 * Decode the remaining DCT coefficient tokens of one 4x4 block; the caller
 * has already consumed (and passed) the first EOB check.
 *
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probability set for the first token read here, selected
 *                   by the caller from the surrounding-zero context
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
                                        uint8_t probs[8][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
    // the first EOB branch was already tested by the caller, so skip it
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
            return i;

skip_eob:
        if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
            if (++i == 16)
                return i; // invalid input; blocks should end with EOB
            // after a zero, the next token uses context 0 and no EOB check
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i+1][1]; // context 1: previous coeff was 1
        } else {
            if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
                        coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a = vp56_rac_get_prob(c, token_prob[8]);
                    int b = vp56_rac_get_prob(c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    // category base values: 11, 19, 35, 67
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i+1][2]; // context 2: previous coeff was > 1
        }
        // read the sign bit, then dequantize (qmul[0] = DC at i==0, else AC)
        block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    return i;
}
#endif
871

    
872
static av_always_inline
873
int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
874
                        uint8_t probs[8][3][NUM_DCT_TOKENS-1],
875
                        int i, int zero_nhood, int16_t qmul[2])
876
{
877
    uint8_t *token_prob = probs[i][zero_nhood];
878
    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
879
        return 0;
880
    return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
881
}
882

    
883
/**
 * Decode all residual coefficients of one macroblock (optional luma DC/WHT
 * block, 16 luma blocks, 8 chroma blocks) and update the left/top non-zero
 * contexts used for coefficient prediction.
 *
 * @param t_nnz non-zero flags of the row above; index 8 tracks the DC block
 * @param l_nnz non-zero flags of the column to the left; index 8 = DC block
 */
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                      uint8_t t_nnz[9], uint8_t l_nnz[9])
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = s->segment;
    int block_dc = 0;

    // whole-mb modes code a separate DC block whose coefficients are
    // inverse-WHT'd into the DC positions of the 16 luma blocks
    if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
                                  s->qmat[segment].luma_dc_qmul);
        l_nnz[8] = t_nnz[8] = !!nnz;
        if (nnz) {
            nnz_total += nnz;
            block_dc = 1;
            if (nnz == 1)
                // only the DC of the WHT block is non-zero: cheaper transform
                s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
        }
        // luma AC blocks then start at coeff 1 and use a different token set
        luma_start = 1;
        luma_ctx = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
                                      nnz_pred, s->qmat[segment].luma_qmul);
            // nnz+block_dc may be one more than the actual last index, but we don't care
            s->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
                nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
                                          nnz_pred, s->qmat[segment].chroma_qmul);
                s->non_zero_count_cache[i][(y<<1)+x] = nnz;
                t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
943

    
944
static av_always_inline
945
void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
946
                      int linesize, int uvlinesize, int simple)
947
{
948
    AV_COPY128(top_border, src_y + 15*linesize);
949
    if (!simple) {
950
        AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
951
        AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
952
    }
953
}
954

    
955
/**
 * Swap (or copy) 8-byte groups of border pixels between the picture and the
 * per-macroblock top_border buffer, one row above the macroblock.
 *
 * @param xchg 1 to swap buffer and picture, 0 to copy buffer -> picture
 */
static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                    int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
                    int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border-32;     // for TL prediction
    // move to the row directly above the macroblock
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

// swap or copy one aligned 8-byte group depending on the xchg argument
#define XCHG(a,b,xchg) do {                     \
        if (xchg) AV_SWAP64(b,a);               \
        else      AV_COPY64(b,a);               \
    } while (0)

    XCHG(top_border_m1+8, src_y-8, xchg);
    XCHG(top_border,      src_y,   xchg);
    XCHG(top_border+8,    src_y+8, 1);
    // top-right pixels live in the next macroblock's border slot
    if (mb_x < mb_width-1)
        XCHG(top_border+32, src_y+16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1+16, src_cb-8, xchg);
        XCHG(top_border_m1+24, src_cr-8, xchg);
        XCHG(top_border+16,    src_cb, 1);
        XCHG(top_border+24,    src_cr, 1);
    }
}
985

    
986
static av_always_inline
987
int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
988
{
989
    if (!mb_x) {
990
        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
991
    } else {
992
        return mb_y ? mode : LEFT_DC_PRED8x8;
993
    }
994
}
995

    
996
static av_always_inline
997
int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
998
{
999
    if (!mb_x) {
1000
        return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
1001
    } else {
1002
        return mb_y ? mode : HOR_PRED8x8;
1003
    }
1004
}
1005

    
1006
static av_always_inline
1007
int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
1008
{
1009
    if (mode == DC_PRED8x8) {
1010
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1011
    } else {
1012
        return mode;
1013
    }
1014
}
1015

    
1016
static av_always_inline
1017
int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
1018
{
1019
    switch (mode) {
1020
    case DC_PRED8x8:
1021
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1022
    case VERT_PRED8x8:
1023
        return !mb_y ? DC_127_PRED8x8 : mode;
1024
    case HOR_PRED8x8:
1025
        return !mb_x ? DC_129_PRED8x8 : mode;
1026
    case PLANE_PRED8x8 /*TM*/:
1027
        return check_tm_pred8x8_mode(mode, mb_x, mb_y);
1028
    }
1029
    return mode;
1030
}
1031

    
1032
static av_always_inline
1033
int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
1034
{
1035
    if (!mb_x) {
1036
        return mb_y ? VERT_VP8_PRED : DC_129_PRED;
1037
    } else {
1038
        return mb_y ? mode : HOR_VP8_PRED;
1039
    }
1040
}
1041

    
1042
static av_always_inline
1043
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1044
{
1045
    switch (mode) {
1046
    case VERT_PRED:
1047
        if (!mb_x && mb_y) {
1048
            *copy_buf = 1;
1049
            return mode;
1050
        }
1051
        /* fall-through */
1052
    case DIAG_DOWN_LEFT_PRED:
1053
    case VERT_LEFT_PRED:
1054
        return !mb_y ? DC_127_PRED : mode;
1055
    case HOR_PRED:
1056
        if (!mb_y) {
1057
            *copy_buf = 1;
1058
            return mode;
1059
        }
1060
        /* fall-through */
1061
    case HOR_UP_PRED:
1062
        return !mb_x ? DC_129_PRED : mode;
1063
    case TM_VP8_PRED:
1064
        return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1065
    case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1066
    case DIAG_DOWN_RIGHT_PRED:
1067
    case VERT_RIGHT_PRED:
1068
    case HOR_DOWN_PRED:
1069
        if (!mb_y || !mb_x)
1070
            *copy_buf = 1;
1071
        return mode;
1072
    }
1073
    return mode;
1074
}
1075

    
1076
/**
 * Intra-predict one macroblock (luma 16x16 or 16 4x4 sub-blocks, plus both
 * chroma planes) and add the decoded residual for the 4x4 case.
 */
static av_always_inline
void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                   int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz, tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        // whole-mb 16x16 prediction
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        // per-4x4-block prediction with individual modes
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use bottom edge
        // the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            tr = tr_right[-1]*0x01010101;
            tr_right = (uint8_t *)&tr;
        }

        if (mb->skip)
            AV_ZERO128(s->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                // 5 rows x 8 bytes scratch block: row 0 holds the synthetic
                // top edge, column 3 of each row the synthetic left edge
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        // predict into the scratch block with edges filled in
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            // no top row: synthesize it as 127
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            // no left column: synthesize it as 129
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    // copy the predicted 4x4 block back into the picture
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                // add the residual on top of the prediction
                nnz = s->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr   += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    // chroma: one shared mode for both planes
    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    // restore the border pixels swapped out above
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}
1196

    
1197
// Per-subpel-fraction lookup used by the MC functions below;
// indexed by [row][mv fraction & 7].
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
1203

    
1204
/**
 * Luma MC function.
 *
 * @param s VP8 decoding context
 * @param dst target buffer for block data at block position
 * @param src reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_luma(VP8Context *s, uint8_t *dst, uint8_t *src, const VP56mv *mv,
                 int x_off, int y_off, int block_w, int block_h,
                 int width, int height, int linesize,
                 vp8_mc_func mc_func[3][3])
{
    if (AV_RN32A(mv)) {
        // subpel fraction (integer part is mv >> 2); the index tables tell
        // how many extra edge pixels the chosen filter needs
        int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
        int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 2;
        y_off += mv->y >> 2;

        // edge emulation
        src += y_off * linesize + x_off;
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
        }
        mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
    } else
        // zero mv: plain copy, no filtering needed
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
}
1248

    
1249
/**
 * Chroma MC function, operating on both chroma planes at once.
 * Parameters are as in vp8_mc_luma, with dst1/src1 = Cb and dst2/src2 = Cr;
 * the integer part of the chroma mv is mv >> 3.
 */
static av_always_inline
void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, uint8_t *src1,
                   uint8_t *src2, const VP56mv *mv, int x_off, int y_off,
                   int block_w, int block_h, int width, int height, int linesize,
                   vp8_mc_func mc_func[3][3])
{
    if (AV_RN32A(mv)) {
        int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
        int my = mv->y&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 3;
        y_off += mv->y >> 3;

        // edge emulation
        src1 += y_off * linesize + x_off;
        src2 += y_off * linesize + x_off;
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            // both planes go through the shared edge buffer, so each one
            // must be filtered before the buffer is reused for the other
            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);

            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        } else {
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        }
    } else {
        // zero mv: plain copy of both planes
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}
1287

    
1288
/**
 * Motion-compensate one rectangular partition: luma plus both chroma
 * planes, with the chroma mv derived from the luma mv.
 *
 * @param bx_off/by_off position of the partition inside the macroblock
 * @param block_w/block_h luma partition dimensions
 */
static av_always_inline
void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
                 AVFrame *ref_frame, int x_off, int y_off,
                 int bx_off, int by_off,
                 int block_w, int block_h,
                 int width, int height, VP56mv *mv)
{
    VP56mv uvmv = *mv;

    /* Y */
    vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
                ref_frame->data[0], mv, x_off + bx_off, y_off + by_off,
                block_w, block_h, width, height, s->linesize,
                s->put_pixels_tab[block_w == 8]);

    /* U/V */
    if (s->profile == 3) {
        // profile 3 uses full-pel chroma mvs: drop the fractional bits
        uvmv.x &= ~7;
        uvmv.y &= ~7;
    }
    // chroma planes are half the size in both dimensions
    x_off   >>= 1; y_off   >>= 1;
    bx_off  >>= 1; by_off  >>= 1;
    width   >>= 1; height  >>= 1;
    block_w >>= 1; block_h >>= 1;
    vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame->data[1],
                  ref_frame->data[2], &uvmv, x_off + bx_off, y_off + by_off,
                  block_w, block_h, width, height, s->uvlinesize,
                  s->put_pixels_tab[1 + (block_w == 4)]);
}
1318

    
1319
/* Fetch pixels for estimated mv 4 macroblocks ahead.
 * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
{
    /* Don't prefetch refs that haven't been used very often this frame. */
    if (s->ref_count[ref-1] > (mb_xy >> 5)) {
        int x_off = mb_x << 4, y_off = mb_y << 4;
        // full-pel mv estimate, offset 8 to the right of the block start
        int mx = (mb->mv.x>>2) + x_off + 8;
        int my = (mb->mv.y>>2) + y_off;
        uint8_t **src= s->framep[ref]->data;
        // (mb_x&3)*4 staggers the prefetch rows across neighboring mbs
        int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
        s->dsp.prefetch(src[0]+off, s->linesize, 4);
        // chroma: both planes with one call, using their plane distance as stride
        off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
    }
}
1335

    
1336
/**
 * Apply motion vectors to prediction buffer, chapter 18.
 * Dispatches on the macroblock's mv partitioning and motion-compensates
 * every partition (luma + chroma) from the selected reference frame.
 */
static av_always_inline
void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                   int mb_x, int mb_y)
{
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16*s->mb_width, height = 16*s->mb_height;
    AVFrame *ref = s->framep[mb->ref_frame];
    VP56mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        // single mv for the whole macroblock
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        // one mv per 4x4 sub-block
        int x, y;
        VP56mv uvmv;

        /* Y */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
                            ref->data[0], &bmv[4*y + x],
                            4*x + x_off, 4*y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V */
        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                // chroma mv = rounded average of the 4 covering luma mvs
                uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
                         mb->bmv[ 2*y    * 4 + 2*x+1].x +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].x +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].x;
                uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
                         mb->bmv[ 2*y    * 4 + 2*x+1].y +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].y +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].y;
                // round-to-nearest divide by 4, biased away from zero
                uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
                uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
                if (s->profile == 3) {
                    // profile 3: full-pel chroma mvs
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
                              dst[2] + 4*y*s->uvlinesize + x*4,
                              ref->data[1], ref->data[2], &uvmv,
                              4*x + x_off, 4*y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}
1420

    
1421
/**
 * Inverse-transform and add the residual of one inter macroblock.
 * The per-4x4 non-zero counts are read four at a time as packed 32-bit
 * words, one byte per block, so whole-row shortcuts are cheap.
 */
static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    // luma (for MODE_I4x4 the IDCT was already done during prediction)
    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
            if (nnz4) {
                if (nnz4&~0x01010101) {
                    // mixed counts: handle each block individually,
                    // consuming one byte of nnz4 per block
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;
                    }
                } else {
                    // every coded block in the row is DC-only
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
                }
            }
            y_dst += 4*s->linesize;
        }
    }

    // chroma, one 2x2 group of blocks per plane
    for (ch = 0; ch < 2; ch++) {
        uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1+ch];
            if (nnz4&~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;
                    }
                    ch_dst += 4*s->uvlinesize;
                }
            } else {
                // all four chroma blocks are DC-only
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
            }
        }
    }
}
1471

    
1472
/**
 * Compute the loop filter strength parameters for one macroblock from the
 * frame-level filter settings, segmentation, and per-ref/per-mode deltas.
 *
 * @param f output: filter level, inner limit, and inner-edge enable flag
 */
static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
{
    int interior_limit, filter_level;

    // base level: per-segment value, either absolute or a delta on top of
    // the frame-level filter level
    if (s->segmentation.enabled) {
        filter_level = s->segmentation.filter_level[s->segment];
        if (!s->segmentation.absolute_vals)
            filter_level += s->filter.level;
    } else
        filter_level = s->filter.level;

    // per-reference-frame and per-prediction-mode loop filter deltas
    if (s->lf_delta.enabled) {
        filter_level += s->lf_delta.ref[mb->ref_frame];
        filter_level += s->lf_delta.mode[mb->mode];
    }

/* Like av_clip for inputs 0 and max, where max is equal to (2^n-1).
 * Relies on arithmetic right shift of the negated value to select 0 for
 * negative inputs and max for too-large ones. */
#define POW2CLIP(x,max) (((x) & ~(max)) ? (-(x))>>31 & (max) : (x))
    filter_level = POW2CLIP(filter_level, 63);

    interior_limit = filter_level;
    if (s->filter.sharpness) {
        interior_limit >>= (s->filter.sharpness + 3) >> 2;
        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
    }
    interior_limit = FFMAX(interior_limit, 1);

    f->filter_level = filter_level;
    f->inner_limit = interior_limit;
    // inner block edges are skipped only for skipped whole-block predictions
    f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}
1503

    
1504
/**
 * Apply the normal in-loop filter to one macroblock: the left and top
 * macroblock edges (when they exist) and, optionally, the inner block
 * edges at 4-pixel intervals.
 */
static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize = s->linesize;
    int uvlinesize = s->uvlinesize;
    // high-edge-variance threshold, indexed by [keyframe][filter_level]
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    if (!filter_level)
        return;

    // macroblock edges use a wider limit than inner block edges
     bedge_lim = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    // left macroblock edge (vertical edge, horizontal filtering)
    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    // inner vertical edges at x = 4, 8, 12 (chroma: x = 4)
    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
                                             uvlinesize,  bedge_lim,
                                             inner_limit, hev_thresh);
    }

    // top macroblock edge (horizontal edge, vertical filtering)
    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    // inner horizontal edges at y = 4, 8, 12 (chroma: y = 4)
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
                                             dst[2] + 4 * uvlinesize,
                                             uvlinesize,  bedge_lim,
                                             inner_limit, hev_thresh);
    }
}
1573

    
1574
static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1575
{
1576
    int mbedge_lim, bedge_lim;
1577
    int filter_level = f->filter_level;
1578
    int inner_limit = f->inner_limit;
1579
    int inner_filter = f->inner_filter;
1580
    int linesize = s->linesize;
1581

    
1582
    if (!filter_level)
1583
        return;
1584

    
1585
     bedge_lim = 2*filter_level + inner_limit;
1586
    mbedge_lim = bedge_lim + 4;
1587

    
1588
    if (mb_x)
1589
        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1590
    if (inner_filter) {
1591
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1592
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1593
        s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1594
    }
1595

    
1596
    if (mb_y)
1597
        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1598
    if (inner_filter) {
1599
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1600
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1601
        s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1602
    }
1603
}
1604

    
1605
static void filter_mb_row(VP8Context *s, int mb_y)
1606
{
1607
    VP8FilterStrength *f = s->filter_strength;
1608
    uint8_t *dst[3] = {
1609
        s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize,
1610
        s->framep[VP56_FRAME_CURRENT]->data[1] +  8*mb_y*s->uvlinesize,
1611
        s->framep[VP56_FRAME_CURRENT]->data[2] +  8*mb_y*s->uvlinesize
1612
    };
1613
    int mb_x;
1614

    
1615
    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1616
        backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1617
        filter_mb(s, dst, f++, mb_x, mb_y);
1618
        dst[0] += 16;
1619
        dst[1] += 8;
1620
        dst[2] += 8;
1621
    }
1622
}
1623

    
1624
static void filter_mb_row_simple(VP8Context *s, int mb_y)
1625
{
1626
    VP8FilterStrength *f = s->filter_strength;
1627
    uint8_t *dst = s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize;
1628
    int mb_x;
1629

    
1630
    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1631
        backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1632
        filter_mb_simple(s, dst, f++, mb_x, mb_y);
1633
        dst += 16;
1634
    }
1635
}
1636

    
1637
static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1638
                            AVPacket *avpkt)
1639
{
1640
    VP8Context *s = avctx->priv_data;
1641
    int ret, mb_x, mb_y, i, y, referenced;
1642
    enum AVDiscard skip_thresh;
1643
    AVFrame *av_uninit(curframe);
1644

    
1645
    if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1646
        return ret;
1647

    
1648
    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1649
                                || s->update_altref == VP56_FRAME_CURRENT;
1650

    
1651
    skip_thresh = !referenced ? AVDISCARD_NONREF :
1652
                    !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1653

    
1654
    if (avctx->skip_frame >= skip_thresh) {
1655
        s->invisible = 1;
1656
        goto skip_decode;
1657
    }
1658
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1659

    
1660
    for (i = 0; i < 4; i++)
1661
        if (&s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1662
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1663
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1664
            curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1665
            break;
1666
        }
1667
    if (curframe->data[0])
1668
        avctx->release_buffer(avctx, curframe);
1669

    
1670
    curframe->key_frame = s->keyframe;
1671
    curframe->pict_type = s->keyframe ? FF_I_TYPE : FF_P_TYPE;
1672
    curframe->reference = referenced ? 3 : 0;
1673
    if ((ret = avctx->get_buffer(avctx, curframe))) {
1674
        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1675
        return ret;
1676
    }
1677

    
1678
    // Given that arithmetic probabilities are updated every frame, it's quite likely
1679
    // that the values we have on a random interframe are complete junk if we didn't
1680
    // start decode on a keyframe. So just don't display anything rather than junk.
1681
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1682
                         !s->framep[VP56_FRAME_GOLDEN] ||
1683
                         !s->framep[VP56_FRAME_GOLDEN2])) {
1684
        av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1685
        return AVERROR_INVALIDDATA;
1686
    }
1687

    
1688
    s->linesize   = curframe->linesize[0];
1689
    s->uvlinesize = curframe->linesize[1];
1690

    
1691
    if (!s->edge_emu_buffer)
1692
        s->edge_emu_buffer = av_malloc(21*s->linesize);
1693

    
1694
    memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1695

    
1696
    /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1697
    memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1698

    
1699
    // top edge of 127 for intra prediction
1700
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1701
        s->top_border[0][15] = s->top_border[0][23] = 127;
1702
        memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1703
    }
1704
    memset(s->ref_count, 0, sizeof(s->ref_count));
1705
    if (s->keyframe)
1706
        memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1707

    
1708
    #define MARGIN (16 << 2)
1709
    s->mv_min.y = -MARGIN;
1710
    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1711

    
1712
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1713
        VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1714
        VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1715
        int mb_xy = mb_y*s->mb_width;
1716
        uint8_t *dst[3] = {
1717
            curframe->data[0] + 16*mb_y*s->linesize,
1718
            curframe->data[1] +  8*mb_y*s->uvlinesize,
1719
            curframe->data[2] +  8*mb_y*s->uvlinesize
1720
        };
1721

    
1722
        memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1723
        memset(s->left_nnz, 0, sizeof(s->left_nnz));
1724
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1725

    
1726
        // left edge of 129 for intra prediction
1727
        if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1728
            for (i = 0; i < 3; i++)
1729
                for (y = 0; y < 16>>!!i; y++)
1730
                    dst[i][y*curframe->linesize[i]-1] = 129;
1731
            if (mb_y == 1) // top left edge is also 129
1732
                s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1733
        }
1734

    
1735
        s->mv_min.x = -MARGIN;
1736
        s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1737

    
1738
        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1739
            /* Prefetch the current frame, 4 MBs ahead */
1740
            s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1741
            s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1742

    
1743
            decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy);
1744

    
1745
            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1746

    
1747
            if (!mb->skip)
1748
                decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1749

    
1750
            if (mb->mode <= MODE_I4x4)
1751
                intra_predict(s, dst, mb, mb_x, mb_y);
1752
            else
1753
                inter_predict(s, dst, mb, mb_x, mb_y);
1754

    
1755
            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1756

    
1757
            if (!mb->skip) {
1758
                idct_mb(s, dst, mb);
1759
            } else {
1760
                AV_ZERO64(s->left_nnz);
1761
                AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1762

    
1763
                // Reset DC block predictors if they would exist if the mb had coefficients
1764
                if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1765
                    s->left_nnz[8]      = 0;
1766
                    s->top_nnz[mb_x][8] = 0;
1767
                }
1768
            }
1769

    
1770
            if (s->deblock_filter)
1771
                filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1772

    
1773
            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1774

    
1775
            dst[0] += 16;
1776
            dst[1] += 8;
1777
            dst[2] += 8;
1778
            s->mv_min.x -= 64;
1779
            s->mv_max.x -= 64;
1780
        }
1781
        if (s->deblock_filter) {
1782
            if (s->filter.simple)
1783
                filter_mb_row_simple(s, mb_y);
1784
            else
1785
                filter_mb_row(s, mb_y);
1786
        }
1787
        s->mv_min.y -= 64;
1788
        s->mv_max.y -= 64;
1789
    }
1790

    
1791
skip_decode:
1792
    // if future frames don't use the updated probabilities,
1793
    // reset them to the values we saved
1794
    if (!s->update_probabilities)
1795
        s->prob[0] = s->prob[1];
1796

    
1797
    // check if golden and altref are swapped
1798
    if (s->update_altref == VP56_FRAME_GOLDEN &&
1799
        s->update_golden == VP56_FRAME_GOLDEN2)
1800
        FFSWAP(AVFrame *, s->framep[VP56_FRAME_GOLDEN], s->framep[VP56_FRAME_GOLDEN2]);
1801
    else {
1802
        if (s->update_altref != VP56_FRAME_NONE)
1803
            s->framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
1804

    
1805
        if (s->update_golden != VP56_FRAME_NONE)
1806
            s->framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
1807
    }
1808

    
1809
    if (s->update_last) // move cur->prev
1810
        s->framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_CURRENT];
1811

    
1812
    // release no longer referenced frames
1813
    for (i = 0; i < 4; i++)
1814
        if (s->frames[i].data[0] &&
1815
            &s->frames[i] != s->framep[VP56_FRAME_CURRENT] &&
1816
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1817
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1818
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1819
            avctx->release_buffer(avctx, &s->frames[i]);
1820

    
1821
    if (!s->invisible) {
1822
        *(AVFrame*)data = *s->framep[VP56_FRAME_CURRENT];
1823
        *data_size = sizeof(AVFrame);
1824
    }
1825

    
1826
    return avpkt->size;
1827
}
1828

    
1829
static av_cold int vp8_decode_init(AVCodecContext *avctx)
1830
{
1831
    VP8Context *s = avctx->priv_data;
1832

    
1833
    s->avctx = avctx;
1834
    avctx->pix_fmt = PIX_FMT_YUV420P;
1835

    
1836
    dsputil_init(&s->dsp, avctx);
1837
    ff_h264_pred_init(&s->hpc, CODEC_ID_VP8);
1838
    ff_vp8dsp_init(&s->vp8dsp);
1839

    
1840
    return 0;
1841
}
1842

    
1843
static av_cold int vp8_decode_free(AVCodecContext *avctx)
1844
{
1845
    vp8_decode_flush(avctx);
1846
    return 0;
1847
}
1848

    
1849
AVCodec ff_vp8_decoder = {
1850
    "vp8",
1851
    AVMEDIA_TYPE_VIDEO,
1852
    CODEC_ID_VP8,
1853
    sizeof(VP8Context),
1854
    vp8_decode_init,
1855
    NULL,
1856
    vp8_decode_free,
1857
    vp8_decode_frame,
1858
    CODEC_CAP_DR1,
1859
    .flush = vp8_decode_flush,
1860
    .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
1861
};