=============================================
Snow Video Codec Specification Draft 20080110
=============================================

Introduction:
=============
This specification describes the Snow bitstream syntax and semantics as
well as the formal Snow decoding process.

The decoding process is described precisely and any compliant decoder
MUST produce the exact same output for a spec-conformant Snow stream.
For encoding, though, any process which generates a stream compliant to
the syntactical and semantic requirements and which is decodable by
the process described in this spec shall be considered a conformant
Snow encoder.

Definitions:
============

MUST    the specific part must be done to conform to this standard
SHOULD  it is recommended to be done that way, but not strictly required

ilog2(x) is the logarithm of x to base 2, rounded down
ilog2(0) = 0

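For illustration, a direct C translation of this definition (a sketch only,
not part of the bitstream syntax):

/* ilog2 as defined above: floor(log2(x)), with ilog2(0) defined as 0 */
static int ilog2(unsigned x)
{
    int n = 0;
    while (x > 1) {
        x >>= 1;
        n++;
    }
    return n;
}
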
Type definitions:
=================

b   1-bit range coded
u   unsigned scalar value range coded
s   signed scalar value range coded


Bitstream syntax:
=================

frame:
    header
    prediction
    residual

header:
    keyframe                            b   MID_STATE
    if(keyframe || always_reset)
        reset_contexts
    if(keyframe){
        version                         u   header_state
        always_reset                    b   header_state
        temporal_decomposition_type     u   header_state
        temporal_decomposition_count    u   header_state
        spatial_decomposition_count     u   header_state
        colorspace_type                 u   header_state
        chroma_h_shift                  u   header_state
        chroma_v_shift                  u   header_state
        spatial_scalability             b   header_state
        max_ref_frames-1                u   header_state
        qlogs
    }
    if(!keyframe){
        update_mc                       b   header_state
        if(update_mc){
            for(plane=0; plane<2; plane++){
                diag_mc                 b   header_state
                htaps/2-1               u   header_state
                for(i= p->htaps/2; i; i--)
                    |hcoeff[i]|         u   header_state
            }
        }
        update_qlogs                    b   header_state
        if(update_qlogs){
            spatial_decomposition_count u   header_state
            qlogs
        }
    }

    spatial_decomposition_type          s   header_state
    qlog                                s   header_state
    mv_scale                            s   header_state
    qbias                               s   header_state
    block_max_depth                     s   header_state

qlogs:
    for(plane=0; plane<2; plane++){
        quant_table[plane][0][0]        s   header_state
        for(level=0; level < spatial_decomposition_count; level++){
            quant_table[plane][level][1]s   header_state
            quant_table[plane][level][3]s   header_state
        }
    }

reset_contexts
    *_state[*]= MID_STATE

prediction:
    for(y=0; y<block_count_vertical; y++)
        for(x=0; x<block_count_horizontal; x++)
            block(0)

block(level):
    mvx_diff=mvy_diff=y_diff=cb_diff=cr_diff=0
    if(keyframe){
        intra=1
    }else{
        if(level!=max_block_depth){
            s_context= 2*left->level + 2*top->level + topleft->level + topright->level
            leaf                        b   block_state[4 + s_context]
        }
        if(level==max_block_depth || leaf){
            intra                       b   block_state[1 + left->intra + top->intra]
            if(intra){
                y_diff                  s   block_state[32]
                cb_diff                 s   block_state[64]
                cr_diff                 s   block_state[96]
            }else{
                ref_context= ilog2(2*left->ref) + ilog2(2*top->ref)
                if(ref_frames > 1)
                    ref                 u   block_state[128 + 1024 + 32*ref_context]
                mx_context= ilog2(2*abs(left->mx - top->mx))
                my_context= ilog2(2*abs(left->my - top->my))
                mvx_diff                s   block_state[128 + 32*(mx_context + 16*!!ref)]
                mvy_diff                s   block_state[128 + 32*(my_context + 16*!!ref)]
            }
        }else{
            block(level+1)
            block(level+1)
            block(level+1)
            block(level+1)
        }
    }


residual:
    residual2(luma)
    residual2(chroma_cr)
    residual2(chroma_cb)

residual2:
    for(level=0; level<spatial_decomposition_count; level++){
        if(level==0)
            subband(LL, 0)
        subband(HL, level)
        subband(LH, level)
        subband(HH, level)
    }

subband:
    FIXME


Tag description:
----------------

version
    0
    this MUST NOT change within a bitstream

always_reset
    if 1 then the range coder contexts will be reset after each frame

temporal_decomposition_type
    0

temporal_decomposition_count
    0

spatial_decomposition_count
    FIXME

colorspace_type
    0
    this MUST NOT change within a bitstream

chroma_h_shift
    log2(luma.width / chroma.width)
    this MUST NOT change within a bitstream

chroma_v_shift
    log2(luma.height / chroma.height)
    this MUST NOT change within a bitstream

spatial_scalability
    0

max_ref_frames
    maximum number of reference frames
    this MUST NOT change within a bitstream

update_mc
    indicates that motion compensation filter parameters are stored in the
    header

diag_mc
    flag to enable faster diagonal interpolation
    this SHOULD be 1 unless it turns out to be covered by a valid patent

htaps
    number of half pel interpolation filter taps, MUST be even, >0 and <10

hcoeff
    half pel interpolation filter coefficients, hcoeff[0] are the 2 middle
    coefficients, hcoeff[1] are the next outer ones and so on, resulting in a
    filter like: ... hcoeff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1],
    hcoeff[2] ...
    the sign of the coefficients is not explicitly stored but alternates
    after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,...
    hcoeff[0] is not explicitly stored but found by subtracting the sum
    of all stored coefficients with signs from 32
    hcoeff[0]= 32 - hcoeff[1] - hcoeff[2] - ...
    a good choice for hcoeff and htaps is
    htaps= 6
    hcoeff={40,-10,2}
    an alternative which requires more computations at both encoder and
    decoder side and may or may not be better is
    htaps= 8
    hcoeff={42,-14,6,-2}

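    the reconstruction of the full signed filter from the stored magnitudes
    can be sketched in C as follows (function and variable names are
    illustrative only, not part of the bitstream syntax):

/* Illustrative: rebuild the signed half-pel filter from the stored
 * magnitudes |hcoeff[1..htaps/2-1]|, following the sign and hcoeff[0]
 * rules described above. */
void build_hcoeff(const int *stored_abs, int htaps, int *hcoeff)
{
    int i, sum = 0;
    for (i = 1; i < htaps / 2; i++) {
        /* signs alternate and hcoeff[0] is positive, so hcoeff[1] is negative */
        hcoeff[i] = (i & 1) ? -stored_abs[i] : stored_abs[i];
        sum += hcoeff[i];
    }
    hcoeff[0] = 32 - sum;   /* hcoeff[0] = 32 - hcoeff[1] - hcoeff[2] - ... */
}

    for htaps=6 and stored magnitudes {_,10,2} this yields {40,-10,2}, the
    recommended filter above.
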
ref_frames
    minimum of the number of available reference frames and max_ref_frames
    for example the first frame after a key frame always has ref_frames=1

spatial_decomposition_type
    wavelet type
    0 is a 9/7 symmetric compact integer wavelet
    1 is a 5/3 symmetric compact integer wavelet
    others are reserved
    stored as delta from last, last is reset to 0 if always_reset || keyframe

qlog
    quality (logarithmic quantizer scale)
    stored as delta from last, last is reset to 0 if always_reset || keyframe

mv_scale
    stored as delta from last, last is reset to 0 if always_reset || keyframe
    FIXME check that everything works fine if this changes between frames

qbias
    dequantization bias
    stored as delta from last, last is reset to 0 if always_reset || keyframe

block_max_depth
    maximum depth of the block tree
    stored as delta from last, last is reset to 0 if always_reset || keyframe

quant_table
    quantization table


Highlevel bitstream structure:
==============================
 --------------------------------------------
|                   Header                   |
 --------------------------------------------
|    ------------------------------------    |
|   |               Block0               |   |
|   |             split?                 |   |
|   |     yes              no            |   |
|   |  .........         intra?          |   |
|   | : Block01 :    yes         no      |   |
|   | : Block02 :  .......   ..........  |   |
|   | : Block03 : :  y DC : : ref index: |   |
|   | : Block04 : : cb DC : : motion x : |   |
|   |  .........  : cr DC : : motion y : |   |
|   |              .......   ..........  |   |
|    ------------------------------------    |
|    ------------------------------------    |
|   |               Block1               |   |
|                    ...                     |
 --------------------------------------------
| ------------   ------------   ------------ |
|| Y subbands | | Cb subbands| | Cr subbands||
||  ---  ---  | |  ---  ---  | |  ---  ---  ||
|| |LL0||HL0| | | |LL0||HL0| | | |LL0||HL0| ||
||  ---  ---  | |  ---  ---  | |  ---  ---  ||
||  ---  ---  | |  ---  ---  | |  ---  ---  ||
|| |LH0||HH0| | | |LH0||HH0| | | |LH0||HH0| ||
||  ---  ---  | |  ---  ---  | |  ---  ---  ||
||  ---  ---  | |  ---  ---  | |  ---  ---  ||
|| |HL1||LH1| | | |HL1||LH1| | | |HL1||LH1| ||
||  ---  ---  | |  ---  ---  | |  ---  ---  ||
||  ---  ---  | |  ---  ---  | |  ---  ---  ||
|| |HH1||HL2| | | |HH1||HL2| | | |HH1||HL2| ||
||    ...     | |    ...     | |    ...     ||
| ------------   ------------   ------------ |
 --------------------------------------------

Decoding process:
=================

                                         ------------
                                        |            |
                                        |  Subbands  |
                   ------------         |            |
                  |            |         ------------
                  |  Intra DC  |               |
                  |            |    LL0 subband prediction
                   ------------                |
                                \        Dequantization
 -------------------             \             |
|  Reference frames |             \           IDWT
| -------   ------- |    Motion    \           |
||Frame 0| |Frame 1|| Compensation  .   OBMC   v      -------
| -------   ------- | --------------. \------> + --->|Frame n|-->output
| -------   ------- |                                 -------
||Frame 2| |Frame 3||<----------------------------------/
|        ...        |
 -------------------


Range Coder:
============

Binary Range Coder:
-------------------
The implemented range coder is an adapted version based upon "Range encoding:
an algorithm for removing redundancy from a digitised message." by G. N. N.
Martin.
The symbols encoded by the Snow range coder are bits (0|1). The
associated probabilities are not fixed but change depending on the symbol mix
seen so far.


bit seen | new state
---------+-----------------------------------------------
    0    | 256 - state_transition_table[256 - old_state];
    1    |       state_transition_table[      old_state];

state_transition_table = {
  0,   0,   0,   0,   0,   0,   0,   0,  20,  21,  22,  23,  24,  25,  26,  27,
 28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  37,  38,  39,  40,  41,  42,
 43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  56,  57,
 58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
 74,  75,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
 89,  90,  91,  92,  93,  94,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118,
119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133,
134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179,
180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194,
195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209,
210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225,
226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240,
241, 242, 243, 244, 245, 246, 247, 248, 248,   0,   0,   0,   0,   0,   0,   0};

FIXME

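The table lookup can be written as the following C sketch; it only shows the
context adaptation described by the table above, not the arithmetic coding of
the bit itself (which is still marked FIXME in this spec):

#include <stdint.h>

extern const uint8_t state_transition_table[256]; /* the 256-entry table above */

/* Adapt one 8-bit context after a bit has been coded, following the table.
 * A context is initialized to MID_STATE (see reset_contexts above). */
static void update_state(uint8_t *state, int bit)
{
    if (bit)
        *state =       state_transition_table[      *state];
    else
        *state = 256 - state_transition_table[256 - *state];
}

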
Range Coding of integers:
-------------------------
FIXME


Neighboring Blocks:
===================
left and top are set to the respective blocks unless they are outside of
the image, in which case they are set to the Null block

top-left is set to the top left block unless it is outside of the image, in
which case it is set to the left block

if this block has no larger parent block, or it is at the left side of its
parent block and the top right block is not outside of the image, then the
top right block is used for top-right; else the top-left block is used

Null block
y,cb,cr are 128
level, ref, mx and my are 0


Motion Vector Prediction:
=========================
1. the motion vectors of all the neighboring blocks are scaled to
compensate for the difference of reference frames

scaled_mv= (mv * (256 * (current_reference+1) / (mv.reference+1)) + 128)>>8

2. the median of the scaled left, top and top-right vectors is used as
motion vector prediction

3. the used motion vector is the sum of the predictor and
   (mvx_diff, mvy_diff)*mv_scale


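A C sketch of these three steps for one vector component (the function names
are illustrative; median() is the usual three-way median):

/* three-way median */
static int median(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; }   /* ensure a <= b */
    if (b > c)   b = c;                       /* b = min(b, c) */
    return a > b ? a : b;                     /* median = max(a, min(b, c)) */
}

/* step 1: scale one neighboring vector component for its reference frame */
static int scale_mv_for_ref(int mv, int mv_ref, int cur_ref)
{
    return (mv * (256 * (cur_ref + 1) / (mv_ref + 1)) + 128) >> 8;
}

/* steps 2 and 3: median of the scaled left/top/top-right components plus the
 * coded difference times mv_scale */
static int predict_mv_component(int left, int top, int topright,
                                int diff, int mv_scale)
{
    return median(left, top, topright) + diff * mv_scale;
}

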
Intra DC Prediction:
====================
the luma and chroma values of the left block are used as predictors

the used luma and chroma values are the sum of the predictor and y_diff,
cb_diff, cr_diff; to reverse this in the decoder apply the following:
block[y][x].dc[0] = block[y][x-1].dc[0] +  y_diff;
block[y][x].dc[1] = block[y][x-1].dc[1] + cb_diff;
block[y][x].dc[2] = block[y][x-1].dc[2] + cr_diff;
block[*][-1].dc[*]= 128;


Motion Compensation:
====================

Halfpel interpolation:
----------------------
halfpel interpolation is done by convolution with the halfpel filter stored
in the header:

horizontal halfpel samples are found by
H1[y][x] =    hcoeff[0]*(F[y][x  ] + F[y][x+1])
            + hcoeff[1]*(F[y][x-1] + F[y][x+2])
            + hcoeff[2]*(F[y][x-2] + F[y][x+3])
            + ...
h1[y][x] = (H1[y][x] + 32)>>6;

vertical halfpel samples are found by
H2[y][x] =    hcoeff[0]*(F[y  ][x] + F[y+1][x])
            + hcoeff[1]*(F[y-1][x] + F[y+2][x])
            + ...
h2[y][x] = (H2[y][x] + 32)>>6;

vertical+horizontal halfpel samples are found by
H3[y][x] =    hcoeff[0]*(H2[y][x  ] + H2[y][x+1])
            + hcoeff[1]*(H2[y][x-1] + H2[y][x+2])
            + ...
H3[y][x] =    hcoeff[0]*(H1[y  ][x] + H1[y+1][x])
            + hcoeff[1]*(H1[y-1][x] + H1[y+2][x])
            + ...
h3[y][x] = (H3[y][x] + 2048)>>12;


                   F   H1  F
                   |   |   |
                   |   |   |
                   |   |   |
                   F   H1  F
                   |   |   |
                   |   |   |
                   |   |   |
   F-------F-------F-> H1<-F-------F-------F
                   v   v   v
                  H2   H3  H2
                   ^   ^   ^
   F-------F-------F-> H1<-F-------F-------F
                   |   |   |
                   |   |   |
                   |   |   |
                   F   H1  F
                   |   |   |
                   |   |   |
                   |   |   |
                   F   H1  F


unavailable fullpel samples (outside the picture for example) shall be equal
to the closest available fullpel sample

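as an illustration, the horizontal halfpel formula above can be written as the
following C sketch (the replication of unavailable fullpel samples at the
borders is assumed to have been done already; names are illustrative):

#include <stdint.h>

/* Horizontal halfpel sample h1[y][x] for the htaps-tap filter hcoeff[],
 * following the H1/h1 formulas above.  F is the fullpel plane with the
 * given line stride. */
static int halfpel_h(const uint8_t *F, int stride, const int *hcoeff,
                     int htaps, int y, int x)
{
    const uint8_t *line = F + y * stride;
    int i, H1 = 0;

    for (i = 0; i < htaps / 2; i++)
        H1 += hcoeff[i] * (line[x - i] + line[x + 1 + i]);
    return (H1 + 32) >> 6;
}

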
Smaller pel interpolation:
--------------------------
if diag_mc is set then points which lie on a line between 2 vertically,
horizontally or diagonally adjacent halfpel points shall be interpolated
linearly with rounding to nearest and halfway values rounded up.
points which lie on 2 diagonals at the same time should only use the one
diagonal not containing the fullpel point



           F-->O---q---O<--h1->O---q---O<--F
           v \           / v \           / v
           O   O       O   O   O       O   O
           |         /     |     \         |
           q       q       q       q       q
           |     /         |         \     |
           O   O       O   O   O       O   O
           ^ /           \ ^ /           \ ^
          h2-->O---q---O<--h3->O---q---O<--h2
           v \           / v \           / v
           O   O       O   O   O       O   O
           |     \         |         /     |
           q       q       q       q       q
           |         \     |     /         |
           O   O       O   O   O       O   O
           ^ /           \ ^ /           \ ^
           F-->O---q---O<--h1->O---q---O<--F



the remaining points shall be bilinearly interpolated from the
up to 4 surrounding halfpel and fullpel points, again rounding should be to
nearest and halfway values rounded up

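for example, the rounding rule above applied to plain linear and bilinear
interpolation might look like the following C sketch (positions expressed in
eighth-pel steps are only an illustrative granularity, this is not the
normative subpel pipeline):

/* linear interpolation between samples a and b at position k/8, rounding to
 * nearest with halfway values rounded up */
static int interp_linear8(int a, int b, int k)
{
    return (a * (8 - k) + b * k + 4) >> 3;
}

/* bilinear interpolation from the 4 surrounding points, same rounding */
static int interp_bilinear8(int tl, int tr, int bl, int br, int kx, int ky)
{
    int top    = tl * (8 - kx) + tr * kx;
    int bottom = bl * (8 - kx) + br * kx;
    return (top * (8 - ky) + bottom * ky + 32) >> 6;
}
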
compliant Snow decoders MUST support 1-1/8 pel luma and 1/2-1/16 pel chroma
interpolation at least


Overlapped block motion compensation:
-------------------------------------
FIXME

LL band prediction:
===================
Each sample in the LL0 subband is predicted by the median of the left, top and
left+top-topleft samples; samples outside the subband shall be considered to
be 0. To reverse this prediction in the decoder apply the following.
for(y=0; y<height; y++){
    for(x=0; x<width; x++){
        sample[y][x] += median(sample[y-1][x],
                               sample[y][x-1],
                               sample[y-1][x]+sample[y][x-1]-sample[y-1][x-1]);
    }
}
sample[-1][*]=sample[*][-1]= 0;
width,height here are the width and height of the LL0 subband not of the final
video


Dequantization:
===============
FIXME

Wavelet Transform:
==================

Snow supports 2 wavelet transforms, the symmetric biorthogonal 5/3 integer
transform and an integer approximation of the symmetric biorthogonal 9/7
Daubechies wavelet.

2D IDWT (inverse discrete wavelet transform)
--------------------------------------------
The 2D IDWT applies a 2D filter recursively, each time combining the
4 lowest frequency subbands into a single subband until only 1 subband
remains.
The 2D filter is done by first applying a 1D filter in the vertical direction
and then applying it in the horizontal one.
 ---------------    ---------------    ---------------    ---------------
|LL0|HL0|       |  |   |   |       |  |       |       |  |       |       |
|---+---|  HL1  |  | L0|H0 |  HL1  |  |  LL1  |  HL1  |  |       |       |
|LH0|HH0|       |  |   |   |       |  |       |       |  |       |       |
|-------+-------|->|-------+-------|->|-------+-------|->|   L1  |  H1   |->...
|       |       |  |       |       |  |       |       |  |       |       |
|  LH1  |  HH1  |  |  LH1  |  HH1  |  |  LH1  |  HH1  |  |       |       |
|       |       |  |       |       |  |       |       |  |       |       |
 ---------------    ---------------    ---------------    ---------------

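one possible driver for this recursion is sketched below; idwt_1d_row() and
idwt_1d_column() stand for the 1D filter of the next section (including its
interleaving step) and are only assumed helpers here, as is the packed
subband layout:

#include <stdint.h>

/* assumed helpers: the 1D filter applied to one row or column of w samples */
void idwt_1d_row(int16_t *line, int w);
void idwt_1d_column(int16_t *buf, int stride, int h);

/* Starting with the smallest subbands, each pass runs the 1D filter
 * vertically, then horizontally, growing the combined low-frequency band
 * until the full plane is reconstructed. */
static void idwt_2d(int16_t *buf, int stride, int width, int height,
                    int spatial_decomposition_count)
{
    for (int level = spatial_decomposition_count - 1; level >= 0; level--) {
        int w = width  >> level;
        int h = height >> level;
        for (int x = 0; x < w; x++)            /* vertical pass first */
            idwt_1d_column(buf + x, stride, h);
        for (int y = 0; y < h; y++)            /* then horizontal pass */
            idwt_1d_row(buf + y * stride, w);
    }
}

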
1D Filter:
----------
1. interleave the samples of the low and high frequency subbands like
s={L0, H0, L1, H1, L2, H2, L3, H3, ... }
note that this can end with an L or an H; the number of elements shall be w
s[-1] shall be considered equivalent to s[1  ]
s[w ] shall be considered equivalent to s[w-2]

2. perform the lifting steps in order as described below

5/3 Integer filter:
1. s[i] -= (s[i-1] + s[i+1] + 2)>>2; for all even i < w
2. s[i] += (s[i-1] + s[i+1]    )>>1; for all odd  i < w

\ | /|\ | /|\ | /|\ | /|\
 \|/ | \|/ | \|/ | \|/ |
  +  |  +  |  +  |  +  |   -1/4
 /|\ | /|\ | /|\ | /|\ |
/ | \|/ | \|/ | \|/ | \|/
  |  +  |  +  |  +  |  +   +1/2

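a C sketch of this 1D step, applied to one interleaved line s[0..w-1] with the
mirroring rules from the 1D Filter section:

/* inverse 5/3 lifting on one interleaved line, as listed above;
 * S() applies the mirroring s[-1]==s[1], s[w]==s[w-2] */
static void snow_idwt53_1d(int *s, int w)
{
    int i;

#define S(x) s[(x) < 0 ? 1 : (x) >= w ? w - 2 : (x)]
    for (i = 0; i < w; i += 2)                      /* step 1, even samples */
        s[i] -= (S(i - 1) + S(i + 1) + 2) >> 2;
    for (i = 1; i < w; i += 2)                      /* step 2, odd samples  */
        s[i] += (S(i - 1) + S(i + 1)    ) >> 1;
#undef S
}

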
Snow's 9/7 Integer filter:
1. s[i] -= (3*(s[i-1] + s[i+1])         + 4)>>3; for all even i < w
2. s[i] -=     s[i-1] + s[i+1]                 ; for all odd  i < w
3. s[i] += (   s[i-1] + s[i+1] + 4*s[i] + 8)>>4; for all even i < w
4. s[i] += (3*(s[i-1] + s[i+1])            )>>1; for all odd  i < w

\ | /|\ | /|\ | /|\ | /|\
 \|/ | \|/ | \|/ | \|/ |
  +  |  +  |  +  |  +  |   -3/8
 /|\ | /|\ | /|\ | /|\ |
/ | \|/ | \|/ | \|/ | \|/
 (|  + (|  + (|  + (|  +   -1
\ + /|\ + /|\ + /|\ + /|\  +1/4
 \|/ | \|/ | \|/ | \|/ |
  +  |  +  |  +  |  +  |   +1/16
 /|\ | /|\ | /|\ | /|\ |
/ | \|/ | \|/ | \|/ | \|/
  |  +  |  +  |  +  |  +   +3/2

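and the corresponding sketch for the 9/7 steps; each step runs over the whole
line before the next one starts:

/* inverse lifting for Snow's 9/7 filter, same conventions as the 5/3 sketch */
static void snow_idwt97_1d(int *s, int w)
{
    int i;

#define S(x) s[(x) < 0 ? 1 : (x) >= w ? w - 2 : (x)]
    for (i = 0; i < w; i += 2)                              /* step 1 */
        s[i] -= (3 * (S(i - 1) + S(i + 1)) + 4) >> 3;
    for (i = 1; i < w; i += 2)                              /* step 2 */
        s[i] -=      S(i - 1) + S(i + 1);
    for (i = 0; i < w; i += 2)                              /* step 3 */
        s[i] += (S(i - 1) + S(i + 1) + 4 * s[i] + 8) >> 4;
    for (i = 1; i < w; i += 2)                              /* step 4 */
        s[i] += (3 * (S(i - 1) + S(i + 1))) >> 1;
#undef S
}

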
optimization tips:
the following are exactly identical
(3a)>>1 == a + (a>>1)
(a + 4b + 8)>>4 == ((a>>2) + b + 2)>>2

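the two identities can be checked mechanically, for example with this small
test program (it assumes the arithmetic right shift used throughout this
spec):

#include <assert.h>

int main(void)
{
    for (int a = -1024; a <= 1024; a++) {
        assert(((3 * a) >> 1) == a + (a >> 1));
        for (int b = -1024; b <= 1024; b++)
            assert(((a + 4 * b + 8) >> 4) == (((a >> 2) + b + 2) >> 2));
    }
    return 0;
}
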
16bit implementation note:
The IDWT can be implemented with 16 bits, but this requires some care to
prevent overflows; the following list gives the minimum number of bits needed
for some terms
1. lifting step
A= s[i-1] + s[i+1]                              16bit
3*A + 4                                         18bit
A + (A>>1) + 2                                  17bit

3. lifting step
s[i-1] + s[i+1]                                 17bit

4. lifting step
3*(s[i-1] + s[i+1])                             17bit


TODO:
=====
Important:
finetune initial contexts
flip wavelet?
try to use the wavelet transformed predicted image (motion compensated image) as context for coding the residual coefficients
try the MV length as context for coding the residual coefficients
use extradata for stuff which is in the keyframes now?
the MV median predictor is patented IIRC
implement per picture halfpel interpolation
try different range coder state transition tables for different contexts

Not Important:
compare the 6 tap and 8 tap hpel filters (psnr/bitrate and subjective quality)
spatial_scalability b vs u (!= 0 breaks syntax anyway so we can add a u later)


Credits:
========
Michael Niedermayer
Loren Merritt


Copyright:
==========
GPL + GFDL + whatever is needed to make this an RFC