Statistics
| Branch: | Revision:

ffmpeg / libavcodec / snow.c @ 755bfeab

History | View | Annotate | Download (167 KB)

1
/*
2
 * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "avcodec.h"
22
#include "dsputil.h"
23
#include "snow.h"
24

    
25
#include "rangecoder.h"
26

    
27
#include "mpegvideo.h"
28

    
29
#undef NDEBUG
30
#include <assert.h>
31

    
32
static const int8_t quant3[256]={
33
 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
34
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
35
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
36
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
42
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
43
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
44
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
45
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
46
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
47
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
48
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
49
};
50
static const int8_t quant3b[256]={
51
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
57
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
59
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
60
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
61
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
62
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
63
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
64
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
65
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
66
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
67
};
68
static const int8_t quant3bA[256]={
69
 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
70
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
71
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
72
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
73
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
74
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
75
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
76
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
77
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
78
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
79
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
80
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
81
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
82
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
83
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
84
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
85
};
86
static const int8_t quant5[256]={
87
 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
88
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
91
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
92
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
93
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
94
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
95
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
96
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
97
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
98
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
99
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
100
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
101
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
102
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
103
};
104
static const int8_t quant7[256]={
105
 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
106
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
107
 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
108
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
109
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
110
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
111
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
112
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
113
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
114
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
115
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
116
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
117
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
118
-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
119
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
120
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
121
};
122
static const int8_t quant9[256]={
123
 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
124
 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
125
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
126
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
127
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
128
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
129
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
130
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
131
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
132
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
133
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
134
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
135
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
136
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
137
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
138
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
139
};
140
static const int8_t quant11[256]={
141
 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
142
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
143
 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
144
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
145
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
146
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
147
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
148
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
149
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
150
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
151
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
152
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
153
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
154
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
155
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
156
-4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
157
};
158
static const int8_t quant13[256]={
159
 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
160
 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
161
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
162
 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
163
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
164
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
165
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
166
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
167
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
168
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
169
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
170
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
171
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
172
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
173
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
174
-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
175
};
176

    
177
#if 0 //64*cubic
178
static const uint8_t obmc32[1024]={
179
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
181
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
182
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
183
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
184
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
185
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
186
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
187
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
188
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
189
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
190
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
191
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
192
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
193
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
194
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
195
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
196
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
197
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
198
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
199
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
200
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
201
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
202
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
203
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
204
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
205
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
206
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
207
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
208
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
209
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
210
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
211
//error:0.000022
212
};
213
static const uint8_t obmc16[256]={
214
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
215
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
216
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
217
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
218
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
219
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
220
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
221
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
222
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
223
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
224
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
225
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
226
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
227
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
228
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
229
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
230
//error:0.000033
231
};
232
#elif 1 // 64*linear
233
static const uint8_t obmc32[1024]={
234
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
235
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
236
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
237
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
238
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
239
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
240
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
241
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
242
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
243
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
244
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
245
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
246
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
247
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
248
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
249
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
250
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
251
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
252
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
253
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
254
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
255
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
256
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
257
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
258
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
259
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
260
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
261
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
262
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
263
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
264
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
265
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
266
 //error:0.000020
267
};
268
static const uint8_t obmc16[256]={
269
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
270
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
271
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
272
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
273
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
274
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
275
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
276
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
277
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
278
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
279
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
280
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
281
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
282
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
283
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
284
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
285
//error:0.000015
286
};
287
#else //64*cos
288
static const uint8_t obmc32[1024]={
289
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
290
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
291
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
292
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
293
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
294
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
295
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
296
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
297
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
298
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
299
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
300
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
301
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
302
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
303
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
304
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
305
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
306
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
307
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
308
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
309
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
310
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
311
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
312
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
313
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
314
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
315
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
316
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
317
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
318
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
319
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
320
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
321
//error:0.000022
322
};
323
static const uint8_t obmc16[256]={
324
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
325
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
326
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
327
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
328
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
329
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
330
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
331
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
332
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
333
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
334
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
335
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
336
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
337
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
338
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
339
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
340
//error:0.000022
341
};
342
#endif
343

    
344
//linear *64
345
static const uint8_t obmc8[64]={
346
  4, 12, 20, 28, 28, 20, 12,  4,
347
 12, 36, 60, 84, 84, 60, 36, 12,
348
 20, 60,100,140,140,100, 60, 20,
349
 28, 84,140,196,196,140, 84, 28,
350
 28, 84,140,196,196,140, 84, 28,
351
 20, 60,100,140,140,100, 60, 20,
352
 12, 36, 60, 84, 84, 60, 36, 12,
353
  4, 12, 20, 28, 28, 20, 12,  4,
354
//error:0.000000
355
};
356

    
357
//linear *64
358
static const uint8_t obmc4[16]={
359
 16, 48, 48, 16,
360
 48,144,144, 48,
361
 48,144,144, 48,
362
 16, 48, 48, 16,
363
//error:0.000000
364
};
365

    
366
static const uint8_t *obmc_tab[4]={
367
    obmc32, obmc16, obmc8, obmc4
368
};
369

    
370
static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
371

    
372
typedef struct BlockNode{
373
    int16_t mx;
374
    int16_t my;
375
    uint8_t ref;
376
    uint8_t color[3];
377
    uint8_t type;
378
//#define TYPE_SPLIT    1
379
#define BLOCK_INTRA   1
380
#define BLOCK_OPT     2
381
//#define TYPE_NOCOLOR  4
382
    uint8_t level; //FIXME merge into type?
383
}BlockNode;
384

    
385
static const BlockNode null_block= { //FIXME add border maybe
386
    .color= {128,128,128},
387
    .mx= 0,
388
    .my= 0,
389
    .ref= 0,
390
    .type= 0,
391
    .level= 0,
392
};
393

    
394
#define LOG2_MB_SIZE 4
395
#define MB_SIZE (1<<LOG2_MB_SIZE)
396

    
397
typedef struct x_and_coeff{
398
    int16_t x;
399
    uint16_t coeff;
400
} x_and_coeff;
401

    
402
typedef struct SubBand{
403
    int level;
404
    int stride;
405
    int width;
406
    int height;
407
    int qlog;                                   ///< log(qscale)/log[2^(1/6)]
408
    DWTELEM *buf;
409
    int buf_x_offset;
410
    int buf_y_offset;
411
    int stride_line; ///< Stride measured in lines, not pixels.
412
    x_and_coeff * x_coeff;
413
    struct SubBand *parent;
414
    uint8_t state[/*7*2*/ 7 + 512][32];
415
}SubBand;
416

    
417
typedef struct Plane{
418
    int width;
419
    int height;
420
    SubBand band[MAX_DECOMPOSITIONS][4];
421
}Plane;
422

    
423
typedef struct SnowContext{
424
//    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
425

    
426
    AVCodecContext *avctx;
427
    RangeCoder c;
428
    DSPContext dsp;
429
    AVFrame new_picture;
430
    AVFrame input_picture;              ///< new_picture with the internal linesizes
431
    AVFrame current_picture;
432
    AVFrame last_picture[MAX_REF_FRAMES];
433
    AVFrame mconly_picture;
434
//     uint8_t q_context[16];
435
    uint8_t header_state[32];
436
    uint8_t block_state[128 + 32*128];
437
    int keyframe;
438
    int always_reset;
439
    int version;
440
    int spatial_decomposition_type;
441
    int last_spatial_decomposition_type;
442
    int temporal_decomposition_type;
443
    int spatial_decomposition_count;
444
    int temporal_decomposition_count;
445
    int max_ref_frames;
446
    int ref_frames;
447
    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
448
    uint32_t *ref_scores[MAX_REF_FRAMES];
449
    DWTELEM *spatial_dwt_buffer;
450
    int colorspace_type;
451
    int chroma_h_shift;
452
    int chroma_v_shift;
453
    int spatial_scalability;
454
    int qlog;
455
    int last_qlog;
456
    int lambda;
457
    int lambda2;
458
    int pass1_rc;
459
    int mv_scale;
460
    int last_mv_scale;
461
    int qbias;
462
    int last_qbias;
463
#define QBIAS_SHIFT 3
464
    int b_width;
465
    int b_height;
466
    int block_max_depth;
467
    int last_block_max_depth;
468
    Plane plane[MAX_PLANES];
469
    BlockNode *block;
470
#define ME_CACHE_SIZE 1024
471
    int me_cache[ME_CACHE_SIZE];
472
    int me_cache_generation;
473
    slice_buffer sb;
474

    
475
    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
476
}SnowContext;
477

    
478
typedef struct {
479
    DWTELEM *b0;
480
    DWTELEM *b1;
481
    DWTELEM *b2;
482
    DWTELEM *b3;
483
    int y;
484
} dwt_compose_t;
485

    
486
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
487
//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
488

    
489
static void iterative_me(SnowContext *s);
490

    
491
static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
492
{
493
    int i;
494

    
495
    buf->base_buffer = base_buffer;
496
    buf->line_count = line_count;
497
    buf->line_width = line_width;
498
    buf->data_count = max_allocated_lines;
499
    buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
500
    buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
501

    
502
    for (i = 0; i < max_allocated_lines; i++)
503
    {
504
      buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
505
    }
506

    
507
    buf->data_stack_top = max_allocated_lines - 1;
508
}
509

    
510
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
511
{
512
    int offset;
513
    DWTELEM * buffer;
514

    
515
//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
516

    
517
    assert(buf->data_stack_top >= 0);
518
//  assert(!buf->line[line]);
519
    if (buf->line[line])
520
        return buf->line[line];
521

    
522
    offset = buf->line_width * line;
523
    buffer = buf->data_stack[buf->data_stack_top];
524
    buf->data_stack_top--;
525
    buf->line[line] = buffer;
526

    
527
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
528

    
529
    return buffer;
530
}
531

    
532
static void slice_buffer_release(slice_buffer * buf, int line)
533
{
534
    int offset;
535
    DWTELEM * buffer;
536

    
537
    assert(line >= 0 && line < buf->line_count);
538
    assert(buf->line[line]);
539

    
540
    offset = buf->line_width * line;
541
    buffer = buf->line[line];
542
    buf->data_stack_top++;
543
    buf->data_stack[buf->data_stack_top] = buffer;
544
    buf->line[line] = NULL;
545

    
546
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
547
}
548

    
549
static void slice_buffer_flush(slice_buffer * buf)
550
{
551
    int i;
552
    for (i = 0; i < buf->line_count; i++)
553
    {
554
        if (buf->line[i])
555
        {
556
//      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
557
            slice_buffer_release(buf, i);
558
        }
559
    }
560
}
561

    
562
static void slice_buffer_destroy(slice_buffer * buf)
563
{
564
    int i;
565
    slice_buffer_flush(buf);
566

    
567
    for (i = buf->data_count - 1; i >= 0; i--)
568
    {
569
        assert(buf->data_stack[i]);
570
        av_freep(&buf->data_stack[i]);
571
    }
572
    assert(buf->data_stack);
573
    av_freep(&buf->data_stack);
574
    assert(buf->line);
575
    av_freep(&buf->line);
576
}
577

    
578
#ifdef __sgi
579
// Avoid a name clash on SGI IRIX
580
#undef qexp
581
#endif
582
#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
583
static uint8_t qexp[QROOT];
584

    
585
static inline int mirror(int v, int m){
586
    while((unsigned)v > (unsigned)m){
587
        v=-v;
588
        if(v<0) v+= 2*m;
589
    }
590
    return v;
591
}
592

    
593
static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
594
    int i;
595

    
596
    if(v){
597
        const int a= FFABS(v);
598
        const int e= av_log2(a);
599
#if 1
600
        const int el= FFMIN(e, 10);
601
        put_rac(c, state+0, 0);
602

    
603
        for(i=0; i<el; i++){
604
            put_rac(c, state+1+i, 1);  //1..10
605
        }
606
        for(; i<e; i++){
607
            put_rac(c, state+1+9, 1);  //1..10
608
        }
609
        put_rac(c, state+1+FFMIN(i,9), 0);
610

    
611
        for(i=e-1; i>=el; i--){
612
            put_rac(c, state+22+9, (a>>i)&1); //22..31
613
        }
614
        for(; i>=0; i--){
615
            put_rac(c, state+22+i, (a>>i)&1); //22..31
616
        }
617

    
618
        if(is_signed)
619
            put_rac(c, state+11 + el, v < 0); //11..21
620
#else
621

    
622
        put_rac(c, state+0, 0);
623
        if(e<=9){
624
            for(i=0; i<e; i++){
625
                put_rac(c, state+1+i, 1);  //1..10
626
            }
627
            put_rac(c, state+1+i, 0);
628

    
629
            for(i=e-1; i>=0; i--){
630
                put_rac(c, state+22+i, (a>>i)&1); //22..31
631
            }
632

    
633
            if(is_signed)
634
                put_rac(c, state+11 + e, v < 0); //11..21
635
        }else{
636
            for(i=0; i<e; i++){
637
                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
638
            }
639
            put_rac(c, state+1+FFMIN(i,9), 0);
640

    
641
            for(i=e-1; i>=0; i--){
642
                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
643
            }
644

    
645
            if(is_signed)
646
                put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
647
        }
648
#endif
649
    }else{
650
        put_rac(c, state+0, 1);
651
    }
652
}
653

    
654
static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
655
    if(get_rac(c, state+0))
656
        return 0;
657
    else{
658
        int i, e, a;
659
        e= 0;
660
        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
661
            e++;
662
        }
663

    
664
        a= 1;
665
        for(i=e-1; i>=0; i--){
666
            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
667
        }
668

    
669
        if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
670
            return -a;
671
        else
672
            return a;
673
    }
674
}
675

    
676
static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
677
    int i;
678
    int r= log2>=0 ? 1<<log2 : 1;
679

    
680
    assert(v>=0);
681
    assert(log2>=-4);
682

    
683
    while(v >= r){
684
        put_rac(c, state+4+log2, 1);
685
        v -= r;
686
        log2++;
687
        if(log2>0) r+=r;
688
    }
689
    put_rac(c, state+4+log2, 0);
690

    
691
    for(i=log2-1; i>=0; i--){
692
        put_rac(c, state+31-i, (v>>i)&1);
693
    }
694
}
695

    
696
static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
697
    int i;
698
    int r= log2>=0 ? 1<<log2 : 1;
699
    int v=0;
700

    
701
    assert(log2>=-4);
702

    
703
    while(get_rac(c, state+4+log2)){
704
        v+= r;
705
        log2++;
706
        if(log2>0) r+=r;
707
    }
708

    
709
    for(i=log2-1; i>=0; i--){
710
        v+= get_rac(c, state+31-i)<<i;
711
    }
712

    
713
    return v;
714
}
715

    
716
static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
717
    const int mirror_left= !highpass;
718
    const int mirror_right= (width&1) ^ highpass;
719
    const int w= (width>>1) - 1 + (highpass & width);
720
    int i;
721

    
722
#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
723
    if(mirror_left){
724
        dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
725
        dst += dst_step;
726
        src += src_step;
727
    }
728

    
729
    for(i=0; i<w; i++){
730
        dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
731
    }
732

    
733
    if(mirror_right){
734
        dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
735
    }
736
}
737

    
738
#ifndef lift5
739
static av_always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
740
    const int mirror_left= !highpass;
741
    const int mirror_right= (width&1) ^ highpass;
742
    const int w= (width>>1) - 1 + (highpass & width);
743
    int i;
744

    
745
    if(mirror_left){
746
        int r= 3*2*ref[0];
747
        r += r>>4;
748
        r += r>>8;
749
        dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
750
        dst += dst_step;
751
        src += src_step;
752
    }
753

    
754
    for(i=0; i<w; i++){
755
        int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
756
        r += r>>4;
757
        r += r>>8;
758
        dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
759
    }
760

    
761
    if(mirror_right){
762
        int r= 3*2*ref[w*ref_step];
763
        r += r>>4;
764
        r += r>>8;
765
        dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
766
    }
767
}
768
#endif
769

    
770
#ifndef liftS
771
static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
772
    const int mirror_left= !highpass;
773
    const int mirror_right= (width&1) ^ highpass;
774
    const int w= (width>>1) - 1 + (highpass & width);
775
    int i;
776

    
777
    assert(shift == 4);
778
#define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
779
    if(mirror_left){
780
        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
781
        dst += dst_step;
782
        src += src_step;
783
    }
784

    
785
    for(i=0; i<w; i++){
786
        dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
787
    }
788

    
789
    if(mirror_right){
790
        dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
791
    }
792
}
793
#endif
794

    
795

    
796
static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
797
    int x, i;
798

    
799
    for(x=start; x<width; x+=2){
800
        int64_t sum=0;
801

    
802
        for(i=0; i<n; i++){
803
            int x2= x + 2*i - n + 1;
804
            if     (x2<     0) x2= -x2;
805
            else if(x2>=width) x2= 2*width-x2-2;
806
            sum += coeffs[i]*(int64_t)dst[x2];
807
        }
808
        if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
809
        else        dst[x] += (sum + (1<<shift)/2)>>shift;
810
    }
811
}
812

    
813
static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
814
    int x, y, i;
815
    for(y=start; y<height; y+=2){
816
        for(x=0; x<width; x++){
817
            int64_t sum=0;
818

    
819
            for(i=0; i<n; i++){
820
                int y2= y + 2*i - n + 1;
821
                if     (y2<      0) y2= -y2;
822
                else if(y2>=height) y2= 2*height-y2-2;
823
                sum += coeffs[i]*(int64_t)dst[x + y2*stride];
824
            }
825
            if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
826
            else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
827
        }
828
    }
829
}
830

    
831
#define SCALEX 1
832
#define LX0 0
833
#define LX1 1
834

    
835
#if 0 // more accurate 9/7
836
#define N1 2
837
#define SHIFT1 14
838
#define COEFFS1 (int[]){-25987,-25987}
839
#define N2 2
840
#define SHIFT2 19
841
#define COEFFS2 (int[]){-27777,-27777}
842
#define N3 2
843
#define SHIFT3 15
844
#define COEFFS3 (int[]){28931,28931}
845
#define N4 2
846
#define SHIFT4 15
847
#define COEFFS4 (int[]){14533,14533}
848
#elif 1 // 13/7 CRF
849
#define N1 4
850
#define SHIFT1 4
851
#define COEFFS1 (int[]){1,-9,-9,1}
852
#define N2 4
853
#define SHIFT2 4
854
#define COEFFS2 (int[]){-1,5,5,-1}
855
#define N3 0
856
#define SHIFT3 1
857
#define COEFFS3 NULL
858
#define N4 0
859
#define SHIFT4 1
860
#define COEFFS4 NULL
861
#elif 1 // 3/5
862
#define LX0 1
863
#define LX1 0
864
#define SCALEX 0.5
865
#define N1 2
866
#define SHIFT1 1
867
#define COEFFS1 (int[]){1,1}
868
#define N2 2
869
#define SHIFT2 2
870
#define COEFFS2 (int[]){-1,-1}
871
#define N3 0
872
#define SHIFT3 0
873
#define COEFFS3 NULL
874
#define N4 0
875
#define SHIFT4 0
876
#define COEFFS4 NULL
877
#elif 1 // 11/5
878
#define N1 0
879
#define SHIFT1 1
880
#define COEFFS1 NULL
881
#define N2 2
882
#define SHIFT2 2
883
#define COEFFS2 (int[]){-1,-1}
884
#define N3 2
885
#define SHIFT3 0
886
#define COEFFS3 (int[]){-1,-1}
887
#define N4 4
888
#define SHIFT4 7
889
#define COEFFS4 (int[]){-5,29,29,-5}
890
#define SCALEX 4
891
#elif 1 // 9/7 CDF
892
#define N1 2
893
#define SHIFT1 7
894
#define COEFFS1 (int[]){-203,-203}
895
#define N2 2
896
#define SHIFT2 12
897
#define COEFFS2 (int[]){-217,-217}
898
#define N3 2
899
#define SHIFT3 7
900
#define COEFFS3 (int[]){113,113}
901
#define N4 2
902
#define SHIFT4 9
903
#define COEFFS4 (int[]){227,227}
904
#define SCALEX 1
905
#elif 1 // 7/5 CDF
906
#define N1 0
907
#define SHIFT1 1
908
#define COEFFS1 NULL
909
#define N2 2
910
#define SHIFT2 2
911
#define COEFFS2 (int[]){-1,-1}
912
#define N3 2
913
#define SHIFT3 0
914
#define COEFFS3 (int[]){-1,-1}
915
#define N4 2
916
#define SHIFT4 4
917
#define COEFFS4 (int[]){3,3}
918
#elif 1 // 9/7 MN
919
#define N1 4
920
#define SHIFT1 4
921
#define COEFFS1 (int[]){1,-9,-9,1}
922
#define N2 2
923
#define SHIFT2 2
924
#define COEFFS2 (int[]){1,1}
925
#define N3 0
926
#define SHIFT3 1
927
#define COEFFS3 NULL
928
#define N4 0
929
#define SHIFT4 1
930
#define COEFFS4 NULL
931
#else // 13/7 CRF
932
#define N1 4
933
#define SHIFT1 4
934
#define COEFFS1 (int[]){1,-9,-9,1}
935
#define N2 4
936
#define SHIFT2 4
937
#define COEFFS2 (int[]){-1,5,5,-1}
938
#define N3 0
939
#define SHIFT3 1
940
#define COEFFS3 NULL
941
#define N4 0
942
#define SHIFT4 1
943
#define COEFFS4 NULL
944
#endif
945
static void horizontal_decomposeX(DWTELEM *b, int width){
946
    DWTELEM temp[width];
947
    const int width2= width>>1;
948
    const int w2= (width+1)>>1;
949
    int x;
950

    
951
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
952
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
953
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
954
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
955

    
956
    for(x=0; x<width2; x++){
957
        temp[x   ]= b[2*x    ];
958
        temp[x+w2]= b[2*x + 1];
959
    }
960
    if(width&1)
961
        temp[x   ]= b[2*x    ];
962
    memcpy(b, temp, width*sizeof(int));
963
}
964

    
965
static void horizontal_composeX(DWTELEM *b, int width){
966
    DWTELEM temp[width];
967
    const int width2= width>>1;
968
    int x;
969
    const int w2= (width+1)>>1;
970

    
971
    memcpy(temp, b, width*sizeof(int));
972
    for(x=0; x<width2; x++){
973
        b[2*x    ]= temp[x   ];
974
        b[2*x + 1]= temp[x+w2];
975
    }
976
    if(width&1)
977
        b[2*x    ]= temp[x   ];
978

    
979
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
980
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
981
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
982
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
983
}
984

    
985
static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
986
    int x, y;
987

    
988
    for(y=0; y<height; y++){
989
        for(x=0; x<width; x++){
990
            buffer[y*stride + x] *= SCALEX;
991
        }
992
    }
993

    
994
    for(y=0; y<height; y++){
995
        horizontal_decomposeX(buffer + y*stride, width);
996
    }
997

    
998
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
999
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
1000
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
1001
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
1002
}
1003

    
1004
static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
1005
    int x, y;
1006

    
1007
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1008
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1009
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1010
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1011

    
1012
    for(y=0; y<height; y++){
1013
        horizontal_composeX(buffer + y*stride, width);
1014
    }
1015

    
1016
    for(y=0; y<height; y++){
1017
        for(x=0; x<width; x++){
1018
            buffer[y*stride + x] /= SCALEX;
1019
        }
1020
    }
1021
}
1022

    
1023
static void horizontal_decompose53i(DWTELEM *b, int width){
1024
    DWTELEM temp[width];
1025
    const int width2= width>>1;
1026
    int x;
1027
    const int w2= (width+1)>>1;
1028

    
1029
    for(x=0; x<width2; x++){
1030
        temp[x   ]= b[2*x    ];
1031
        temp[x+w2]= b[2*x + 1];
1032
    }
1033
    if(width&1)
1034
        temp[x   ]= b[2*x    ];
1035
#if 0
1036
    {
1037
    int A1,A2,A3,A4;
1038
    A2= temp[1       ];
1039
    A4= temp[0       ];
1040
    A1= temp[0+width2];
1041
    A1 -= (A2 + A4)>>1;
1042
    A4 += (A1 + 1)>>1;
1043
    b[0+width2] = A1;
1044
    b[0       ] = A4;
1045
    for(x=1; x+1<width2; x+=2){
1046
        A3= temp[x+width2];
1047
        A4= temp[x+1     ];
1048
        A3 -= (A2 + A4)>>1;
1049
        A2 += (A1 + A3 + 2)>>2;
1050
        b[x+width2] = A3;
1051
        b[x       ] = A2;
1052

1053
        A1= temp[x+1+width2];
1054
        A2= temp[x+2       ];
1055
        A1 -= (A2 + A4)>>1;
1056
        A4 += (A1 + A3 + 2)>>2;
1057
        b[x+1+width2] = A1;
1058
        b[x+1       ] = A4;
1059
    }
1060
    A3= temp[width-1];
1061
    A3 -= A2;
1062
    A2 += (A1 + A3 + 2)>>2;
1063
    b[width -1] = A3;
1064
    b[width2-1] = A2;
1065
    }
1066
#else
1067
    lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1068
    lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1069
#endif
1070
}
1071

    
1072
static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1073
    int i;
1074

    
1075
    for(i=0; i<width; i++){
1076
        b1[i] -= (b0[i] + b2[i])>>1;
1077
    }
1078
}
1079

    
1080
static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1081
    int i;
1082

    
1083
    for(i=0; i<width; i++){
1084
        b1[i] += (b0[i] + b2[i] + 2)>>2;
1085
    }
1086
}
1087

    
1088
static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1089
    int y;
1090
    DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1091
    DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1092

    
1093
    for(y=-2; y<height; y+=2){
1094
        DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1095
        DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1096

    
1097
{START_TIMER
1098
        if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1099
        if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1100
STOP_TIMER("horizontal_decompose53i")}
1101

    
1102
{START_TIMER
1103
        if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1104
        if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1105
STOP_TIMER("vertical_decompose53i*")}
1106

    
1107
        b0=b2;
1108
        b1=b3;
1109
    }
1110
}
1111

    
1112
static void horizontal_decompose97i(DWTELEM *b, int width){
1113
    DWTELEM temp[width];
1114
    const int w2= (width+1)>>1;
1115

    
1116
    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1117
    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1118
    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1119
    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1120
}
1121

    
1122

    
1123
static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1124
    int i;
1125

    
1126
    for(i=0; i<width; i++){
1127
        b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1128
    }
1129
}
1130

    
1131
static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1132
    int i;
1133

    
1134
    for(i=0; i<width; i++){
1135
#ifdef lift5
1136
        b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1137
#else
1138
        int r= 3*(b0[i] + b2[i]);
1139
        r+= r>>4;
1140
        r+= r>>8;
1141
        b1[i] += (r+W_CO)>>W_CS;
1142
#endif
1143
    }
1144
}
1145

    
1146
static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1147
    int i;
1148

    
1149
    for(i=0; i<width; i++){
1150
#ifdef liftS
1151
        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1152
#else
1153
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1154
#endif
1155
    }
1156
}
1157

    
1158
static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1159
    int i;
1160

    
1161
    for(i=0; i<width; i++){
1162
        b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1163
    }
1164
}
1165

    
1166
static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1167
    int y;
1168
    DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1169
    DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1170
    DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1171
    DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1172

    
1173
    for(y=-4; y<height; y+=2){
1174
        DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1175
        DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1176

    
1177
{START_TIMER
1178
        if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1179
        if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1180
if(width>400){
1181
STOP_TIMER("horizontal_decompose97i")
1182
}}
1183

    
1184
{START_TIMER
1185
        if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1186
        if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1187
        if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1188
        if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1189

    
1190
if(width>400){
1191
STOP_TIMER("vertical_decompose97i")
1192
}}
1193

    
1194
        b0=b2;
1195
        b1=b3;
1196
        b2=b4;
1197
        b3=b5;
1198
    }
1199
}
1200

    
1201
void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1202
    int level;
1203

    
1204
    for(level=0; level<decomposition_count; level++){
1205
        switch(type){
1206
        case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1207
        case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1208
        case DWT_X: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1209
        }
1210
    }
1211
}
1212

    
1213
static void horizontal_compose53i(DWTELEM *b, int width){
1214
    DWTELEM temp[width];
1215
    const int width2= width>>1;
1216
    const int w2= (width+1)>>1;
1217
    int x;
1218

    
1219
#if 0
1220
    int A1,A2,A3,A4;
1221
    A2= temp[1       ];
1222
    A4= temp[0       ];
1223
    A1= temp[0+width2];
1224
    A1 -= (A2 + A4)>>1;
1225
    A4 += (A1 + 1)>>1;
1226
    b[0+width2] = A1;
1227
    b[0       ] = A4;
1228
    for(x=1; x+1<width2; x+=2){
1229
        A3= temp[x+width2];
1230
        A4= temp[x+1     ];
1231
        A3 -= (A2 + A4)>>1;
1232
        A2 += (A1 + A3 + 2)>>2;
1233
        b[x+width2] = A3;
1234
        b[x       ] = A2;
1235

1236
        A1= temp[x+1+width2];
1237
        A2= temp[x+2       ];
1238
        A1 -= (A2 + A4)>>1;
1239
        A4 += (A1 + A3 + 2)>>2;
1240
        b[x+1+width2] = A1;
1241
        b[x+1       ] = A4;
1242
    }
1243
    A3= temp[width-1];
1244
    A3 -= A2;
1245
    A2 += (A1 + A3 + 2)>>2;
1246
    b[width -1] = A3;
1247
    b[width2-1] = A2;
1248
#else
1249
    lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1250
    lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1251
#endif
1252
    for(x=0; x<width2; x++){
1253
        b[2*x    ]= temp[x   ];
1254
        b[2*x + 1]= temp[x+w2];
1255
    }
1256
    if(width&1)
1257
        b[2*x    ]= temp[x   ];
1258
}
1259

    
1260
static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1261
    int i;
1262

    
1263
    for(i=0; i<width; i++){
1264
        b1[i] += (b0[i] + b2[i])>>1;
1265
    }
1266
}
1267

    
1268
static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1269
    int i;
1270

    
1271
    for(i=0; i<width; i++){
1272
        b1[i] -= (b0[i] + b2[i] + 2)>>2;
1273
    }
1274
}
1275

    
1276
static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1277
    cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1278
    cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1279
    cs->y = -1;
1280
}
1281

    
1282
static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1283
    cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1284
    cs->b1 = buffer + mirror(-1  , height-1)*stride;
1285
    cs->y = -1;
1286
}
1287

    
1288
static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1289
    int y= cs->y;
1290

    
1291
    DWTELEM *b0= cs->b0;
1292
    DWTELEM *b1= cs->b1;
1293
    DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1294
    DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1295

    
1296
{START_TIMER
1297
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1298
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1299
STOP_TIMER("vertical_compose53i*")}
1300

    
1301
{START_TIMER
1302
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1303
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1304
STOP_TIMER("horizontal_compose53i")}
1305

    
1306
    cs->b0 = b2;
1307
    cs->b1 = b3;
1308
    cs->y += 2;
1309
}
1310

    
1311
static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1312
    int y= cs->y;
1313
    DWTELEM *b0= cs->b0;
1314
    DWTELEM *b1= cs->b1;
1315
    DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1316
    DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1317

    
1318
{START_TIMER
1319
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1320
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1321
STOP_TIMER("vertical_compose53i*")}
1322

    
1323
{START_TIMER
1324
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1325
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1326
STOP_TIMER("horizontal_compose53i")}
1327

    
1328
    cs->b0 = b2;
1329
    cs->b1 = b3;
1330
    cs->y += 2;
1331
}
1332

    
1333
static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1334
    dwt_compose_t cs;
1335
    spatial_compose53i_init(&cs, buffer, height, stride);
1336
    while(cs.y <= height)
1337
        spatial_compose53i_dy(&cs, buffer, width, height, stride);
1338
}
1339

    
1340

    
1341
void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1342
    DWTELEM temp[width];
1343
    const int w2= (width+1)>>1;
1344

    
1345
    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1346
    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1347
    liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1348
    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1349
}
1350

    
1351
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1352
    int i;
1353

    
1354
    for(i=0; i<width; i++){
1355
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1356
    }
1357
}
1358

    
1359
static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1360
    int i;
1361

    
1362
    for(i=0; i<width; i++){
1363
#ifdef lift5
1364
        b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1365
#else
1366
        int r= 3*(b0[i] + b2[i]);
1367
        r+= r>>4;
1368
        r+= r>>8;
1369
        b1[i] -= (r+W_CO)>>W_CS;
1370
#endif
1371
    }
1372
}
1373

    
1374
static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1375
    int i;
1376

    
1377
    for(i=0; i<width; i++){
1378
#ifdef liftS
1379
        b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1380
#else
1381
        b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1382
#endif
1383
    }
1384
}
1385

    
1386
static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1387
    int i;
1388

    
1389
    for(i=0; i<width; i++){
1390
        b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1391
    }
1392
}
1393

    
1394
void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1395
    int i;
1396

    
1397
    for(i=0; i<width; i++){
1398
#ifndef lift5
1399
        int r;
1400
#endif
1401
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1402
#ifdef lift5
1403
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1404
#else
1405
        r= 3*(b2[i] + b4[i]);
1406
        r+= r>>4;
1407
        r+= r>>8;
1408
        b3[i] -= (r+W_CO)>>W_CS;
1409
#endif
1410
#ifdef liftS
1411
        b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1412
#else
1413
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1414
#endif
1415
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1416
    }
1417
}
1418

    
1419
static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1420
    cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1421
    cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1422
    cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1423
    cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1424
    cs->y = -3;
1425
}
1426

    
1427
static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1428
    cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1429
    cs->b1 = buffer + mirror(-3  , height-1)*stride;
1430
    cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1431
    cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1432
    cs->y = -3;
1433
}
1434

    
1435
static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1436
    int y = cs->y;
1437

    
1438
    DWTELEM *b0= cs->b0;
1439
    DWTELEM *b1= cs->b1;
1440
    DWTELEM *b2= cs->b2;
1441
    DWTELEM *b3= cs->b3;
1442
    DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1443
    DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1444

    
1445
{START_TIMER
1446
    if(y>0 && y+4<height){
1447
        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1448
    }else{
1449
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1450
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1451
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1452
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1453
    }
1454
if(width>400){
1455
STOP_TIMER("vertical_compose97i")}}
1456

    
1457
{START_TIMER
1458
        if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1459
        if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1460
if(width>400 && y+0<(unsigned)height){
1461
STOP_TIMER("horizontal_compose97i")}}
1462

    
1463
    cs->b0=b2;
1464
    cs->b1=b3;
1465
    cs->b2=b4;
1466
    cs->b3=b5;
1467
    cs->y += 2;
1468
}
1469

    
1470
static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1471
    int y = cs->y;
1472
    DWTELEM *b0= cs->b0;
1473
    DWTELEM *b1= cs->b1;
1474
    DWTELEM *b2= cs->b2;
1475
    DWTELEM *b3= cs->b3;
1476
    DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1477
    DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1478

    
1479
{START_TIMER
1480
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1481
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1482
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1483
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1484
if(width>400){
1485
STOP_TIMER("vertical_compose97i")}}
1486

    
1487
{START_TIMER
1488
        if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1489
        if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1490
if(width>400 && b0 <= b2){
1491
STOP_TIMER("horizontal_compose97i")}}
1492

    
1493
    cs->b0=b2;
1494
    cs->b1=b3;
1495
    cs->b2=b4;
1496
    cs->b3=b5;
1497
    cs->y += 2;
1498
}
1499

    
1500
static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1501
    dwt_compose_t cs;
1502
    spatial_compose97i_init(&cs, buffer, height, stride);
1503
    while(cs.y <= height)
1504
        spatial_compose97i_dy(&cs, buffer, width, height, stride);
1505
}
1506

    
1507
static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1508
    int level;
1509
    for(level=decomposition_count-1; level>=0; level--){
1510
        switch(type){
1511
        case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1512
        case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1513
        /* not slicified yet */
1514
        case DWT_X: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1515
          av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1516
        }
1517
    }
1518
}
1519

    
1520
static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1521
    int level;
1522
    for(level=decomposition_count-1; level>=0; level--){
1523
        switch(type){
1524
        case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1525
        case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1526
        /* not slicified yet */
1527
        case DWT_X: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1528
        }
1529
    }
1530
}
1531

    
1532
static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1533
    const int support = type==1 ? 3 : 5;
1534
    int level;
1535
    if(type==2) return;
1536

    
1537
    for(level=decomposition_count-1; level>=0; level--){
1538
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1539
            switch(type){
1540
            case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1541
                    break;
1542
            case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1543
                    break;
1544
            case DWT_X: break;
1545
            }
1546
        }
1547
    }
1548
}
1549

    
1550
static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1551
    const int support = type==1 ? 3 : 5;
1552
    int level;
1553
    if(type==2) return;
1554

    
1555
    for(level=decomposition_count-1; level>=0; level--){
1556
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1557
            switch(type){
1558
            case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1559
                    break;
1560
            case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1561
                    break;
1562
            case DWT_X: break;
1563
            }
1564
        }
1565
    }
1566
}
1567

    
1568
static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1569
    if(type==2){
1570
        int level;
1571
        for(level=decomposition_count-1; level>=0; level--)
1572
            spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1573
    }else{
1574
        dwt_compose_t cs[MAX_DECOMPOSITIONS];
1575
        int y;
1576
        ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1577
        for(y=0; y<height; y+=4)
1578
            ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1579
    }
1580
}
1581

    
1582
static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1583
    const int w= b->width;
1584
    const int h= b->height;
1585
    int x, y;
1586

    
1587
    if(1){
1588
        int run=0;
1589
        int runs[w*h];
1590
        int run_index=0;
1591
        int max_index;
1592

    
1593
        for(y=0; y<h; y++){
1594
            for(x=0; x<w; x++){
1595
                int v, p=0;
1596
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1597
                v= src[x + y*stride];
1598

    
1599
                if(y){
1600
                    t= src[x + (y-1)*stride];
1601
                    if(x){
1602
                        lt= src[x - 1 + (y-1)*stride];
1603
                    }
1604
                    if(x + 1 < w){
1605
                        rt= src[x + 1 + (y-1)*stride];
1606
                    }
1607
                }
1608
                if(x){
1609
                    l= src[x - 1 + y*stride];
1610
                    /*if(x > 1){
1611
                        if(orientation==1) ll= src[y + (x-2)*stride];
1612
                        else               ll= src[x - 2 + y*stride];
1613
                    }*/
1614
                }
1615
                if(parent){
1616
                    int px= x>>1;
1617
                    int py= y>>1;
1618
                    if(px<b->parent->width && py<b->parent->height)
1619
                        p= parent[px + py*2*stride];
1620
                }
1621
                if(!(/*ll|*/l|lt|t|rt|p)){
1622
                    if(v){
1623
                        runs[run_index++]= run;
1624
                        run=0;
1625
                    }else{
1626
                        run++;
1627
                    }
1628
                }
1629
            }
1630
        }
1631
        max_index= run_index;
1632
        runs[run_index++]= run;
1633
        run_index=0;
1634
        run= runs[run_index++];
1635

    
1636
        put_symbol2(&s->c, b->state[30], max_index, 0);
1637
        if(run_index <= max_index)
1638
            put_symbol2(&s->c, b->state[1], run, 3);
1639

    
1640
        for(y=0; y<h; y++){
1641
            if(s->c.bytestream_end - s->c.bytestream < w*40){
1642
                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1643
                return -1;
1644
            }
1645
            for(x=0; x<w; x++){
1646
                int v, p=0;
1647
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1648
                v= src[x + y*stride];
1649

    
1650
                if(y){
1651
                    t= src[x + (y-1)*stride];
1652
                    if(x){
1653
                        lt= src[x - 1 + (y-1)*stride];
1654
                    }
1655
                    if(x + 1 < w){
1656
                        rt= src[x + 1 + (y-1)*stride];
1657
                    }
1658
                }
1659
                if(x){
1660
                    l= src[x - 1 + y*stride];
1661
                    /*if(x > 1){
1662
                        if(orientation==1) ll= src[y + (x-2)*stride];
1663
                        else               ll= src[x - 2 + y*stride];
1664
                    }*/
1665
                }
1666
                if(parent){
1667
                    int px= x>>1;
1668
                    int py= y>>1;
1669
                    if(px<b->parent->width && py<b->parent->height)
1670
                        p= parent[px + py*2*stride];
1671
                }
1672
                if(/*ll|*/l|lt|t|rt|p){
1673
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1674

    
1675
                    put_rac(&s->c, &b->state[0][context], !!v);
1676
                }else{
1677
                    if(!run){
1678
                        run= runs[run_index++];
1679

    
1680
                        if(run_index <= max_index)
1681
                            put_symbol2(&s->c, b->state[1], run, 3);
1682
                        assert(v);
1683
                    }else{
1684
                        run--;
1685
                        assert(!v);
1686
                    }
1687
                }
1688
                if(v){
1689
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1690
                    int l2= 2*FFABS(l) + (l<0);
1691
                    int t2= 2*FFABS(t) + (t<0);
1692

    
1693
                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1694
                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1695
                }
1696
            }
1697
        }
1698
    }
1699
    return 0;
1700
}
1701

    
1702
static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1703
//    encode_subband_qtree(s, b, src, parent, stride, orientation);
1704
//    encode_subband_z0run(s, b, src, parent, stride, orientation);
1705
    return encode_subband_c0run(s, b, src, parent, stride, orientation);
1706
//    encode_subband_dzr(s, b, src, parent, stride, orientation);
1707
}
1708

    
1709
static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1710
    const int w= b->width;
1711
    const int h= b->height;
1712
    int x,y;
1713

    
1714
    if(1){
1715
        int run, runs;
1716
        x_and_coeff *xc= b->x_coeff;
1717
        x_and_coeff *prev_xc= NULL;
1718
        x_and_coeff *prev2_xc= xc;
1719
        x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1720
        x_and_coeff *prev_parent_xc= parent_xc;
1721

    
1722
        runs= get_symbol2(&s->c, b->state[30], 0);
1723
        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1724
        else           run= INT_MAX;
1725

    
1726
        for(y=0; y<h; y++){
1727
            int v=0;
1728
            int lt=0, t=0, rt=0;
1729

    
1730
            if(y && prev_xc->x == 0){
1731
                rt= prev_xc->coeff;
1732
            }
1733
            for(x=0; x<w; x++){
1734
                int p=0;
1735
                const int l= v;
1736

    
1737
                lt= t; t= rt;
1738

    
1739
                if(y){
1740
                    if(prev_xc->x <= x)
1741
                        prev_xc++;
1742
                    if(prev_xc->x == x + 1)
1743
                        rt= prev_xc->coeff;
1744
                    else
1745
                        rt=0;
1746
                }
1747
                if(parent_xc){
1748
                    if(x>>1 > parent_xc->x){
1749
                        parent_xc++;
1750
                    }
1751
                    if(x>>1 == parent_xc->x){
1752
                        p= parent_xc->coeff;
1753
                    }
1754
                }
1755
                if(/*ll|*/l|lt|t|rt|p){
1756
                    int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1757

    
1758
                    v=get_rac(&s->c, &b->state[0][context]);
1759
                    if(v){
1760
                        v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1761
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1762

    
1763
                        xc->x=x;
1764
                        (xc++)->coeff= v;
1765
                    }
1766
                }else{
1767
                    if(!run){
1768
                        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1769
                        else           run= INT_MAX;
1770
                        v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1771
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1772

    
1773
                        xc->x=x;
1774
                        (xc++)->coeff= v;
1775
                    }else{
1776
                        int max_run;
1777
                        run--;
1778
                        v=0;
1779

    
1780
                        if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1781
                        else  max_run= FFMIN(run, w-x-1);
1782
                        if(parent_xc)
1783
                            max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1784
                        x+= max_run;
1785
                        run-= max_run;
1786
                    }
1787
                }
1788
            }
1789
            (xc++)->x= w+1; //end marker
1790
            prev_xc= prev2_xc;
1791
            prev2_xc= xc;
1792

    
1793
            if(parent_xc){
1794
                if(y&1){
1795
                    while(parent_xc->x != parent->width+1)
1796
                        parent_xc++;
1797
                    parent_xc++;
1798
                    prev_parent_xc= parent_xc;
1799
                }else{
1800
                    parent_xc= prev_parent_xc;
1801
                }
1802
            }
1803
        }
1804

    
1805
        (xc++)->x= w+1; //end marker
1806
    }
1807
}
1808

    
1809
static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1810
    const int w= b->width;
1811
    int y;
1812
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1813
    int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1814
    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1815
    int new_index = 0;
1816

    
1817
    START_TIMER
1818

    
1819
    if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1820
        qadd= 0;
1821
        qmul= 1<<QEXPSHIFT;
1822
    }
1823

    
1824
    /* If we are on the second or later slice, restore our index. */
1825
    if (start_y != 0)
1826
        new_index = save_state[0];
1827

    
1828

    
1829
    for(y=start_y; y<h; y++){
1830
        int x = 0;
1831
        int v;
1832
        DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1833
        memset(line, 0, b->width*sizeof(DWTELEM));
1834
        v = b->x_coeff[new_index].coeff;
1835
        x = b->x_coeff[new_index++].x;
1836
        while(x < w)
1837
        {
1838
            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1839
            register int u= -(v&1);
1840
            line[x] = (t^u) - u;
1841

    
1842
            v = b->x_coeff[new_index].coeff;
1843
            x = b->x_coeff[new_index++].x;
1844
        }
1845
    }
1846
    if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1847
        STOP_TIMER("decode_subband")
1848
    }
1849

    
1850
    /* Save our variables for the next slice. */
1851
    save_state[0] = new_index;
1852

    
1853
    return;
1854
}
1855

    
1856
static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1857
    int plane_index, level, orientation;
1858

    
1859
    for(plane_index=0; plane_index<3; plane_index++){
1860
        for(level=0; level<s->spatial_decomposition_count; level++){
1861
            for(orientation=level ? 1:0; orientation<4; orientation++){
1862
                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1863
            }
1864
        }
1865
    }
1866
    memset(s->header_state, MID_STATE, sizeof(s->header_state));
1867
    memset(s->block_state, MID_STATE, sizeof(s->block_state));
1868
}
1869

    
1870
static int alloc_blocks(SnowContext *s){
1871
    int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1872
    int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1873

    
1874
    s->b_width = w;
1875
    s->b_height= h;
1876

    
1877
    s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1878
    return 0;
1879
}
1880

    
1881
static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1882
    uint8_t *bytestream= d->bytestream;
1883
    uint8_t *bytestream_start= d->bytestream_start;
1884
    *d= *s;
1885
    d->bytestream= bytestream;
1886
    d->bytestream_start= bytestream_start;
1887
}
1888

    
1889
//near copy & paste from dsputil, FIXME
1890
static int pix_sum(uint8_t * pix, int line_size, int w)
1891
{
1892
    int s, i, j;
1893

    
1894
    s = 0;
1895
    for (i = 0; i < w; i++) {
1896
        for (j = 0; j < w; j++) {
1897
            s += pix[0];
1898
            pix ++;
1899
        }
1900
        pix += line_size - w;
1901
    }
1902
    return s;
1903
}
1904

    
1905
//near copy & paste from dsputil, FIXME
1906
static int pix_norm1(uint8_t * pix, int line_size, int w)
1907
{
1908
    int s, i, j;
1909
    uint32_t *sq = ff_squareTbl + 256;
1910

    
1911
    s = 0;
1912
    for (i = 0; i < w; i++) {
1913
        for (j = 0; j < w; j ++) {
1914
            s += sq[pix[0]];
1915
            pix ++;
1916
        }
1917
        pix += line_size - w;
1918
    }
1919
    return s;
1920
}
1921

    
1922
static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1923
    const int w= s->b_width << s->block_max_depth;
1924
    const int rem_depth= s->block_max_depth - level;
1925
    const int index= (x + y*w) << rem_depth;
1926
    const int block_w= 1<<rem_depth;
1927
    BlockNode block;
1928
    int i,j;
1929

    
1930
    block.color[0]= l;
1931
    block.color[1]= cb;
1932
    block.color[2]= cr;
1933
    block.mx= mx;
1934
    block.my= my;
1935
    block.ref= ref;
1936
    block.type= type;
1937
    block.level= level;
1938

    
1939
    for(j=0; j<block_w; j++){
1940
        for(i=0; i<block_w; i++){
1941
            s->block[index + i + j*w]= block;
1942
        }
1943
    }
1944
}
1945

    
1946
static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1947
    const int offset[3]= {
1948
          y*c->  stride + x,
1949
        ((y*c->uvstride + x)>>1),
1950
        ((y*c->uvstride + x)>>1),
1951
    };
1952
    int i;
1953
    for(i=0; i<3; i++){
1954
        c->src[0][i]= src [i];
1955
        c->ref[0][i]= ref [i] + offset[i];
1956
    }
1957
    assert(!ref_index);
1958
}
1959

    
1960
static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1961
                           const BlockNode *left, const BlockNode *top, const BlockNode *tr){
1962
    if(s->ref_frames == 1){
1963
        *mx = mid_pred(left->mx, top->mx, tr->mx);
1964
        *my = mid_pred(left->my, top->my, tr->my);
1965
    }else{
1966
        const int *scale = scale_mv_ref[ref];
1967
        *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
1968
                       (top ->mx * scale[top ->ref] + 128) >>8,
1969
                       (tr  ->mx * scale[tr  ->ref] + 128) >>8);
1970
        *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
1971
                       (top ->my * scale[top ->ref] + 128) >>8,
1972
                       (tr  ->my * scale[tr  ->ref] + 128) >>8);
1973
    }
1974
}
1975

    
1976
//FIXME copy&paste
1977
#define P_LEFT P[1]
1978
#define P_TOP P[2]
1979
#define P_TOPRIGHT P[3]
1980
#define P_MEDIAN P[4]
1981
#define P_MV1 P[9]
1982
#define FLAG_QPEL   1 //must be 1
1983

    
1984
static int encode_q_branch(SnowContext *s, int level, int x, int y){
1985
    uint8_t p_buffer[1024];
1986
    uint8_t i_buffer[1024];
1987
    uint8_t p_state[sizeof(s->block_state)];
1988
    uint8_t i_state[sizeof(s->block_state)];
1989
    RangeCoder pc, ic;
1990
    uint8_t *pbbak= s->c.bytestream;
1991
    uint8_t *pbbak_start= s->c.bytestream_start;
1992
    int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
1993
    const int w= s->b_width  << s->block_max_depth;
1994
    const int h= s->b_height << s->block_max_depth;
1995
    const int rem_depth= s->block_max_depth - level;
1996
    const int index= (x + y*w) << rem_depth;
1997
    const int block_w= 1<<(LOG2_MB_SIZE - level);
1998
    int trx= (x+1)<<rem_depth;
1999
    int try= (y+1)<<rem_depth;
2000
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2001
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2002
    const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
2003
    const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2004
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2005
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2006
    int pl = left->color[0];
2007
    int pcb= left->color[1];
2008
    int pcr= left->color[2];
2009
    int pmx, pmy;
2010
    int mx=0, my=0;
2011
    int l,cr,cb;
2012
    const int stride= s->current_picture.linesize[0];
2013
    const int uvstride= s->current_picture.linesize[1];
2014
    uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2015
                                s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2016
                                s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2017
    int P[10][2];
2018
    int16_t last_mv[3][2];
2019
    int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2020
    const int shift= 1+qpel;
2021
    MotionEstContext *c= &s->m.me;
2022
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2023
    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2024
    int my_context= av_log2(2*FFABS(left->my - top->my));
2025
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2026
    int ref, best_ref, ref_score, ref_mx, ref_my;
2027

    
2028
    assert(sizeof(s->block_state) >= 256);
2029
    if(s->keyframe){
2030
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2031
        return 0;
2032
    }
2033

    
2034
//    clip predictors / edge ?
2035

    
2036
    P_LEFT[0]= left->mx;
2037
    P_LEFT[1]= left->my;
2038
    P_TOP [0]= top->mx;
2039
    P_TOP [1]= top->my;
2040
    P_TOPRIGHT[0]= tr->mx;
2041
    P_TOPRIGHT[1]= tr->my;
2042

    
2043
    last_mv[0][0]= s->block[index].mx;
2044
    last_mv[0][1]= s->block[index].my;
2045
    last_mv[1][0]= right->mx;
2046
    last_mv[1][1]= right->my;
2047
    last_mv[2][0]= bottom->mx;
2048
    last_mv[2][1]= bottom->my;
2049

    
2050
    s->m.mb_stride=2;
2051
    s->m.mb_x=
2052
    s->m.mb_y= 0;
2053
    c->skip= 0;
2054

    
2055
    assert(c->  stride ==   stride);
2056
    assert(c->uvstride == uvstride);
2057

    
2058
    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2059
    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2060
    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2061
    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2062

    
2063
    c->xmin = - x*block_w - 16+2;
2064
    c->ymin = - y*block_w - 16+2;
2065
    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2066
    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2067

    
2068
    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2069
    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2070
    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2071
    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2072
    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2073
    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2074
    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2075

    
2076
    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2077
    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2078

    
2079
    if (!y) {
2080
        c->pred_x= P_LEFT[0];
2081
        c->pred_y= P_LEFT[1];
2082
    } else {
2083
        c->pred_x = P_MEDIAN[0];
2084
        c->pred_y = P_MEDIAN[1];
2085
    }
2086

    
2087
    score= INT_MAX;
2088
    best_ref= 0;
2089
    for(ref=0; ref<s->ref_frames; ref++){
2090
        init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2091

    
2092
        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2093
                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2094

    
2095
        assert(ref_mx >= c->xmin);
2096
        assert(ref_mx <= c->xmax);
2097
        assert(ref_my >= c->ymin);
2098
        assert(ref_my <= c->ymax);
2099

    
2100
        ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2101
        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2102
        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2103
        if(s->ref_mvs[ref]){
2104
            s->ref_mvs[ref][index][0]= ref_mx;
2105
            s->ref_mvs[ref][index][1]= ref_my;
2106
            s->ref_scores[ref][index]= ref_score;
2107
        }
2108
        if(score > ref_score){
2109
            score= ref_score;
2110
            best_ref= ref;
2111
            mx= ref_mx;
2112
            my= ref_my;
2113
        }
2114
    }
2115
    //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
2116

    
2117
  //  subpel search
2118
    base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
2119
    pc= s->c;
2120
    pc.bytestream_start=
2121
    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2122
    memcpy(p_state, s->block_state, sizeof(s->block_state));
2123

    
2124
    if(level!=s->block_max_depth)
2125
        put_rac(&pc, &p_state[4 + s_context], 1);
2126
    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2127
    if(s->ref_frames > 1)
2128
        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2129
    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
2130
    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2131
    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2132
    p_len= pc.bytestream - pc.bytestream_start;
2133
    score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
2134

    
2135
    block_s= block_w*block_w;
2136
    sum = pix_sum(current_data[0], stride, block_w);
2137
    l= (sum + block_s/2)/block_s;
2138
    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2139

    
2140
    block_s= block_w*block_w>>2;
2141
    sum = pix_sum(current_data[1], uvstride, block_w>>1);
2142
    cb= (sum + block_s/2)/block_s;
2143
//    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2144
    sum = pix_sum(current_data[2], uvstride, block_w>>1);
2145
    cr= (sum + block_s/2)/block_s;
2146
//    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2147

    
2148
    ic= s->c;
2149
    ic.bytestream_start=
2150
    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2151
    memcpy(i_state, s->block_state, sizeof(s->block_state));
2152
    if(level!=s->block_max_depth)
2153
        put_rac(&ic, &i_state[4 + s_context], 1);
2154
    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2155
    put_symbol(&ic, &i_state[32],  l-pl , 1);
2156
    put_symbol(&ic, &i_state[64], cb-pcb, 1);
2157
    put_symbol(&ic, &i_state[96], cr-pcr, 1);
2158
    i_len= ic.bytestream - ic.bytestream_start;
2159
    iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
2160

    
2161
//    assert(score==256*256*256*64-1);
2162
    assert(iscore < 255*255*256 + s->lambda2*10);
2163
    assert(iscore >= 0);
2164
    assert(l>=0 && l<=255);
2165
    assert(pl>=0 && pl<=255);
2166

    
2167
    if(level==0){
2168
        int varc= iscore >> 8;
2169
        int vard= score >> 8;
2170
        if (vard <= 64 || vard < varc)
2171
            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2172
        else
2173
            c->scene_change_score+= s->m.qscale;
2174
    }
2175

    
2176
    if(level!=s->block_max_depth){
2177
        put_rac(&s->c, &s->block_state[4 + s_context], 0);
2178
        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2179
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2180
        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2181
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2182
        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2183

    
2184
        if(score2 < score && score2 < iscore)
2185
            return score2;
2186
    }
2187

    
2188
    if(iscore < score){
2189
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2190
        memcpy(pbbak, i_buffer, i_len);
2191
        s->c= ic;
2192
        s->c.bytestream_start= pbbak_start;
2193
        s->c.bytestream= pbbak + i_len;
2194
        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2195
        memcpy(s->block_state, i_state, sizeof(s->block_state));
2196
        return iscore;
2197
    }else{
2198
        memcpy(pbbak, p_buffer, p_len);
2199
        s->c= pc;
2200
        s->c.bytestream_start= pbbak_start;
2201
        s->c.bytestream= pbbak + p_len;
2202
        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2203
        memcpy(s->block_state, p_state, sizeof(s->block_state));
2204
        return score;
2205
    }
2206
}
2207

    
2208
static av_always_inline int same_block(BlockNode *a, BlockNode *b){
2209
    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2210
        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2211
    }else{
2212
        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2213
    }
2214
}
2215

    
2216
static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2217
    const int w= s->b_width  << s->block_max_depth;
2218
    const int rem_depth= s->block_max_depth - level;
2219
    const int index= (x + y*w) << rem_depth;
2220
    int trx= (x+1)<<rem_depth;
2221
    BlockNode *b= &s->block[index];
2222
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2223
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2224
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2225
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2226
    int pl = left->color[0];
2227
    int pcb= left->color[1];
2228
    int pcr= left->color[2];
2229
    int pmx, pmy;
2230
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2231
    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2232
    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2233
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2234

    
2235
    if(s->keyframe){
2236
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2237
        return;
2238
    }
2239

    
2240
    if(level!=s->block_max_depth){
2241
        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2242
            put_rac(&s->c, &s->block_state[4 + s_context], 1);
2243
        }else{
2244
            put_rac(&s->c, &s->block_state[4 + s_context], 0);
2245
            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2246
            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2247
            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2248
            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2249
            return;
2250
        }
2251
    }
2252
    if(b->type & BLOCK_INTRA){
2253
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2254
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2255
        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2256
        put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2257
        put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2258
        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2259
    }else{
2260
        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2261
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2262
        if(s->ref_frames > 1)
2263
            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2264
        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2265
        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2266
        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2267
    }
2268
}
2269

    
2270
static void decode_q_branch(SnowContext *s, int level, int x, int y){
2271
    const int w= s->b_width << s->block_max_depth;
2272
    const int rem_depth= s->block_max_depth - level;
2273
    const int index= (x + y*w) << rem_depth;
2274
    int trx= (x+1)<<rem_depth;
2275
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2276
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2277
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2278
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2279
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2280

    
2281
    if(s->keyframe){
2282
        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2283
        return;
2284
    }
2285

    
2286
    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2287
        int type, mx, my;
2288
        int l = left->color[0];
2289
        int cb= left->color[1];
2290
        int cr= left->color[2];
2291
        int ref = 0;
2292
        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2293
        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2294
        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2295

    
2296
        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2297

    
2298
        if(type){
2299
            pred_mv(s, &mx, &my, 0, left, top, tr);
2300
            l += get_symbol(&s->c, &s->block_state[32], 1);
2301
            cb+= get_symbol(&s->c, &s->block_state[64], 1);
2302
            cr+= get_symbol(&s->c, &s->block_state[96], 1);
2303
        }else{
2304
            if(s->ref_frames > 1)
2305
                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2306
            pred_mv(s, &mx, &my, ref, left, top, tr);
2307
            mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2308
            my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2309
        }
2310
        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2311
    }else{
2312
        decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2313
        decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2314
        decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2315
        decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2316
    }
2317
}
2318

    
2319
static void encode_blocks(SnowContext *s, int search){
2320
    int x, y;
2321
    int w= s->b_width;
2322
    int h= s->b_height;
2323

    
2324
    if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2325
        iterative_me(s);
2326

    
2327
    for(y=0; y<h; y++){
2328
        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2329
            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2330
            return;
2331
        }
2332
        for(x=0; x<w; x++){
2333
            if(s->avctx->me_method == ME_ITER || !search)
2334
                encode_q_branch2(s, 0, x, y);
2335
            else
2336
                encode_q_branch (s, 0, x, y);
2337
        }
2338
    }
2339
}
2340

    
2341
static void decode_blocks(SnowContext *s){
2342
    int x, y;
2343
    int w= s->b_width;
2344
    int h= s->b_height;
2345

    
2346
    for(y=0; y<h; y++){
2347
        for(x=0; x<w; x++){
2348
            decode_q_branch(s, 0, x, y);
2349
        }
2350
    }
2351
}
2352

    
2353
static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2354
    int x, y;
2355
START_TIMER
2356
    for(y=0; y < b_h+5; y++){
2357
        for(x=0; x < b_w; x++){
2358
            int a0= src[x    ];
2359
            int a1= src[x + 1];
2360
            int a2= src[x + 2];
2361
            int a3= src[x + 3];
2362
            int a4= src[x + 4];
2363
            int a5= src[x + 5];
2364
//            int am= 9*(a1+a2) - (a0+a3);
2365
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2366
//            int am= 18*(a2+a3) - 2*(a1+a4);
2367
//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2368
//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2369

    
2370
//            if(b_w==16) am= 8*(a1+a2);
2371

    
2372
            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2373
            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2374

    
2375
            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2376
            if(am&(~255)) am= ~(am>>31);
2377

    
2378
            tmp[x] = am;
2379

    
2380
/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2381
            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2382
            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2383
            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2384
        }
2385
        tmp += stride;
2386
        src += stride;
2387
    }
2388
    tmp -= (b_h+5)*stride;
2389

    
2390
    for(y=0; y < b_h; y++){
2391
        for(x=0; x < b_w; x++){
2392
            int a0= tmp[x + 0*stride];
2393
            int a1= tmp[x + 1*stride];
2394
            int a2= tmp[x + 2*stride];
2395
            int a3= tmp[x + 3*stride];
2396
            int a4= tmp[x + 4*stride];
2397
            int a5= tmp[x + 5*stride];
2398
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2399
//            int am= 18*(a2+a3) - 2*(a1+a4);
2400
/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2401
            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2402

    
2403
//            if(b_w==16) am= 8*(a1+a2);
2404

    
2405
            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2406
            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2407

    
2408
            if(am&(~255)) am= ~(am>>31);
2409

    
2410
            dst[x] = am;
2411
/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2412
            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2413
            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2414
            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2415
        }
2416
        dst += stride;
2417
        tmp += stride;
2418
    }
2419
STOP_TIMER("mc_block")
2420
}
2421

    
2422
#define mca(dx,dy,b_w)\
2423
static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
2424
    uint8_t tmp[stride*(b_w+5)];\
2425
    assert(h==b_w);\
2426
    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2427
}
2428

    
2429
mca( 0, 0,16)
2430
mca( 8, 0,16)
2431
mca( 0, 8,16)
2432
mca( 8, 8,16)
2433
mca( 0, 0,8)
2434
mca( 8, 0,8)
2435
mca( 0, 8,8)
2436
mca( 8, 8,8)
2437

    
2438
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2439
    if(block->type & BLOCK_INTRA){
2440
        int x, y;
2441
        const int color = block->color[plane_index];
2442
        const int color4= color*0x01010101;
2443
        if(b_w==32){
2444
            for(y=0; y < b_h; y++){
2445
                *(uint32_t*)&dst[0 + y*stride]= color4;
2446
                *(uint32_t*)&dst[4 + y*stride]= color4;
2447
                *(uint32_t*)&dst[8 + y*stride]= color4;
2448
                *(uint32_t*)&dst[12+ y*stride]= color4;
2449
                *(uint32_t*)&dst[16+ y*stride]= color4;
2450
                *(uint32_t*)&dst[20+ y*stride]= color4;
2451
                *(uint32_t*)&dst[24+ y*stride]= color4;
2452
                *(uint32_t*)&dst[28+ y*stride]= color4;
2453
            }
2454
        }else if(b_w==16){
2455
            for(y=0; y < b_h; y++){
2456
                *(uint32_t*)&dst[0 + y*stride]= color4;
2457
                *(uint32_t*)&dst[4 + y*stride]= color4;
2458
                *(uint32_t*)&dst[8 + y*stride]= color4;
2459
                *(uint32_t*)&dst[12+ y*stride]= color4;
2460
            }
2461
        }else if(b_w==8){
2462
            for(y=0; y < b_h; y++){
2463
                *(uint32_t*)&dst[0 + y*stride]= color4;
2464
                *(uint32_t*)&dst[4 + y*stride]= color4;
2465
            }
2466
        }else if(b_w==4){
2467
            for(y=0; y < b_h; y++){
2468
                *(uint32_t*)&dst[0 + y*stride]= color4;
2469
            }
2470
        }else{
2471
            for(y=0; y < b_h; y++){
2472
                for(x=0; x < b_w; x++){
2473
                    dst[x + y*stride]= color;
2474
                }
2475
            }
2476
        }
2477
    }else{
2478
        uint8_t *src= s->last_picture[block->ref].data[plane_index];
2479
        const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2480
        int mx= block->mx*scale;
2481
        int my= block->my*scale;
2482
        const int dx= mx&15;
2483
        const int dy= my&15;
2484
        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2485
        sx += (mx>>4) - 2;
2486
        sy += (my>>4) - 2;
2487
        src += sx + sy*stride;
2488
        if(   (unsigned)sx >= w - b_w - 4
2489
           || (unsigned)sy >= h - b_h - 4){
2490
            ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2491
            src= tmp + MB_SIZE;
2492
        }
2493
//        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2494
//        assert(!(b_w&(b_w-1)));
2495
        assert(b_w>1 && b_h>1);
2496
        assert(tab_index>=0 && tab_index<4 || b_w==32);
2497
        if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2498
            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2499
        else if(b_w==32){
2500
            int y;
2501
            for(y=0; y<b_h; y+=16){
2502
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2503
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2504
            }
2505
        }else if(b_w==b_h)
2506
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2507
        else if(b_w==2*b_h){
2508
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2509
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2510
        }else{
2511
            assert(2*b_w==b_h);
2512
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2513
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2514
        }
2515
    }
2516
}
2517

    
2518
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2519
                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2520
    int y, x;
2521
    DWTELEM * dst;
2522
    for(y=0; y<b_h; y++){
2523
        //FIXME ugly missue of obmc_stride
2524
        const uint8_t *obmc1= obmc + y*obmc_stride;
2525
        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2526
        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2527
        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2528
        dst = slice_buffer_get_line(sb, src_y + y);
2529
        for(x=0; x<b_w; x++){
2530
            int v=   obmc1[x] * block[3][x + y*src_stride]
2531
                    +obmc2[x] * block[2][x + y*src_stride]
2532
                    +obmc3[x] * block[1][x + y*src_stride]
2533
                    +obmc4[x] * block[0][x + y*src_stride];
2534

    
2535
            v <<= 8 - LOG2_OBMC_MAX;
2536
            if(FRAC_BITS != 8){
2537
                v += 1<<(7 - FRAC_BITS);
2538
                v >>= 8 - FRAC_BITS;
2539
            }
2540
            if(add){
2541
                v += dst[x + src_x];
2542
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2543
                if(v&(~255)) v= ~(v>>31);
2544
                dst8[x + y*src_stride] = v;
2545
            }else{
2546
                dst[x + src_x] -= v;
2547
            }
2548
        }
2549
    }
2550
}
2551

    
2552
//FIXME name clenup (b_w, block_w, b_width stuff)
2553
static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2554
    const int b_width = s->b_width  << s->block_max_depth;
2555
    const int b_height= s->b_height << s->block_max_depth;
2556
    const int b_stride= b_width;
2557
    BlockNode *lt= &s->block[b_x + b_y*b_stride];
2558
    BlockNode *rt= lt+1;
2559
    BlockNode *lb= lt+b_stride;
2560
    BlockNode *rb= lb+1;
2561
    uint8_t *block[4];
2562
    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2563
    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2564
    uint8_t *ptmp;
2565
    int x,y;
2566

    
2567
    if(b_x<0){
2568
        lt= rt;
2569
        lb= rb;
2570
    }else if(b_x + 1 >= b_width){
2571
        rt= lt;
2572
        rb= lb;
2573
    }
2574
    if(b_y<0){
2575
        lt= lb;
2576
        rt= rb;
2577
    }else if(b_y + 1 >= b_height){
2578
        lb= lt;
2579
        rb= rt;
2580
    }
2581

    
2582
    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2583
        obmc -= src_x;
2584
        b_w += src_x;
2585
        if(!sliced && !offset_dst)
2586
            dst -= src_x;
2587
        src_x=0;
2588
    }else if(src_x + b_w > w){
2589
        b_w = w - src_x;
2590
    }
2591
    if(src_y<0){
2592
        obmc -= src_y*obmc_stride;
2593
        b_h += src_y;
2594
        if(!sliced && !offset_dst)
2595
            dst -= src_y*dst_stride;
2596
        src_y=0;
2597
    }else if(src_y + b_h> h){
2598
        b_h = h - src_y;
2599
    }
2600

    
2601
    if(b_w<=0 || b_h<=0) return;
2602

    
2603
assert(src_stride > 2*MB_SIZE + 5);
2604
    if(!sliced && offset_dst)
2605
        dst += src_x + src_y*dst_stride;
2606
    dst8+= src_x + src_y*src_stride;
2607
//    src += src_x + src_y*src_stride;
2608

    
2609
    ptmp= tmp + 3*tmp_step;
2610
    block[0]= ptmp;
2611
    ptmp+=tmp_step;
2612
    pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2613

    
2614
    if(same_block(lt, rt)){
2615
        block[1]= block[0];
2616
    }else{
2617
        block[1]= ptmp;
2618
        ptmp+=tmp_step;
2619
        pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2620
    }
2621

    
2622
    if(same_block(lt, lb)){
2623
        block[2]= block[0];
2624
    }else if(same_block(rt, lb)){
2625
        block[2]= block[1];
2626
    }else{
2627
        block[2]= ptmp;
2628
        ptmp+=tmp_step;
2629
        pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2630
    }
2631

    
2632
    if(same_block(lt, rb) ){
2633
        block[3]= block[0];
2634
    }else if(same_block(rt, rb)){
2635
        block[3]= block[1];
2636
    }else if(same_block(lb, rb)){
2637
        block[3]= block[2];
2638
    }else{
2639
        block[3]= ptmp;
2640
        pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2641
    }
2642
#if 0
2643
    for(y=0; y<b_h; y++){
2644
        for(x=0; x<b_w; x++){
2645
            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2646
            if(add) dst[x + y*dst_stride] += v;
2647
            else    dst[x + y*dst_stride] -= v;
2648
        }
2649
    }
2650
    for(y=0; y<b_h; y++){
2651
        uint8_t *obmc2= obmc + (obmc_stride>>1);
2652
        for(x=0; x<b_w; x++){
2653
            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2654
            if(add) dst[x + y*dst_stride] += v;
2655
            else    dst[x + y*dst_stride] -= v;
2656
        }
2657
    }
2658
    for(y=0; y<b_h; y++){
2659
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2660
        for(x=0; x<b_w; x++){
2661
            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2662
            if(add) dst[x + y*dst_stride] += v;
2663
            else    dst[x + y*dst_stride] -= v;
2664
        }
2665
    }
2666
    for(y=0; y<b_h; y++){
2667
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2668
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2669
        for(x=0; x<b_w; x++){
2670
            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2671
            if(add) dst[x + y*dst_stride] += v;
2672
            else    dst[x + y*dst_stride] -= v;
2673
        }
2674
    }
2675
#else
2676
    if(sliced){
2677
        START_TIMER
2678

    
2679
        s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2680
        STOP_TIMER("inner_add_yblock")
2681
    }else
2682
    for(y=0; y<b_h; y++){
2683
        //FIXME ugly missue of obmc_stride
2684
        const uint8_t *obmc1= obmc + y*obmc_stride;
2685
        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2686
        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2687
        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2688
        for(x=0; x<b_w; x++){
2689
            int v=   obmc1[x] * block[3][x + y*src_stride]
2690
                    +obmc2[x] * block[2][x + y*src_stride]
2691
                    +obmc3[x] * block[1][x + y*src_stride]
2692
                    +obmc4[x] * block[0][x + y*src_stride];
2693

    
2694
            v <<= 8 - LOG2_OBMC_MAX;
2695
            if(FRAC_BITS != 8){
2696
                v += 1<<(7 - FRAC_BITS);
2697
                v >>= 8 - FRAC_BITS;
2698
            }
2699
            if(add){
2700
                v += dst[x + y*dst_stride];
2701
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2702
                if(v&(~255)) v= ~(v>>31);
2703
                dst8[x + y*src_stride] = v;
2704
            }else{
2705
                dst[x + y*dst_stride] -= v;
2706
            }
2707
        }
2708
    }
2709
#endif
2710
}
2711

    
2712
static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2713
    Plane *p= &s->plane[plane_index];
2714
    const int mb_w= s->b_width  << s->block_max_depth;
2715
    const int mb_h= s->b_height << s->block_max_depth;
2716
    int x, y, mb_x;
2717
    int block_size = MB_SIZE >> s->block_max_depth;
2718
    int block_w    = plane_index ? block_size/2 : block_size;
2719
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2720
    int obmc_stride= plane_index ? block_size : 2*block_size;
2721
    int ref_stride= s->current_picture.linesize[plane_index];
2722
    uint8_t *dst8= s->current_picture.data[plane_index];
2723
    int w= p->width;
2724
    int h= p->height;
2725
    START_TIMER
2726

    
2727
    if(s->keyframe || (s->avctx->debug&512)){
2728
        if(mb_y==mb_h)
2729
            return;
2730

    
2731
        if(add){
2732
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2733
            {
2734
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2735
                DWTELEM * line = sb->line[y];
2736
                for(x=0; x<w; x++)
2737
                {
2738
//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2739
                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2740
                    v >>= FRAC_BITS;
2741
                    if(v&(~255)) v= ~(v>>31);
2742
                    dst8[x + y*ref_stride]= v;
2743
                }
2744
            }
2745
        }else{
2746
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2747
            {
2748
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2749
                DWTELEM * line = sb->line[y];
2750
                for(x=0; x<w; x++)
2751
                {
2752
                    line[x] -= 128 << FRAC_BITS;
2753
//                    buf[x + y*w]-= 128<<FRAC_BITS;
2754
                }
2755
            }
2756
        }
2757

    
2758
        return;
2759
    }
2760

    
2761
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2762
            START_TIMER
2763

    
2764
            add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2765
                       block_w*mb_x - block_w/2,
2766
                       block_w*mb_y - block_w/2,
2767
                       block_w, block_w,
2768
                       w, h,
2769
                       w, ref_stride, obmc_stride,
2770
                       mb_x - 1, mb_y - 1,
2771
                       add, 0, plane_index);
2772

    
2773
            STOP_TIMER("add_yblock")
2774
        }
2775

    
2776
    STOP_TIMER("predict_slice")
2777
}
2778

    
2779
static av_always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2780
    Plane *p= &s->plane[plane_index];
2781
    const int mb_w= s->b_width  << s->block_max_depth;
2782
    const int mb_h= s->b_height << s->block_max_depth;
2783
    int x, y, mb_x;
2784
    int block_size = MB_SIZE >> s->block_max_depth;
2785
    int block_w    = plane_index ? block_size/2 : block_size;
2786
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2787
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2788
    int ref_stride= s->current_picture.linesize[plane_index];
2789
    uint8_t *dst8= s->current_picture.data[plane_index];
2790
    int w= p->width;
2791
    int h= p->height;
2792
    START_TIMER
2793

    
2794
    if(s->keyframe || (s->avctx->debug&512)){
2795
        if(mb_y==mb_h)
2796
            return;
2797

    
2798
        if(add){
2799
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2800
                for(x=0; x<w; x++){
2801
                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2802
                    v >>= FRAC_BITS;
2803
                    if(v&(~255)) v= ~(v>>31);
2804
                    dst8[x + y*ref_stride]= v;
2805
                }
2806
            }
2807
        }else{
2808
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2809
                for(x=0; x<w; x++){
2810
                    buf[x + y*w]-= 128<<FRAC_BITS;
2811
                }
2812
            }
2813
        }
2814

    
2815
        return;
2816
    }
2817

    
2818
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2819
            START_TIMER
2820

    
2821
            add_yblock(s, 0, NULL, buf, dst8, obmc,
2822
                       block_w*mb_x - block_w/2,
2823
                       block_w*mb_y - block_w/2,
2824
                       block_w, block_w,
2825
                       w, h,
2826
                       w, ref_stride, obmc_stride,
2827
                       mb_x - 1, mb_y - 1,
2828
                       add, 1, plane_index);
2829

    
2830
            STOP_TIMER("add_yblock")
2831
        }
2832

    
2833
    STOP_TIMER("predict_slice")
2834
}
2835

    
2836
static av_always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2837
    const int mb_h= s->b_height << s->block_max_depth;
2838
    int mb_y;
2839
    for(mb_y=0; mb_y<=mb_h; mb_y++)
2840
        predict_slice(s, buf, plane_index, add, mb_y);
2841
}
2842

    
2843
static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2844
    int i, x2, y2;
2845
    Plane *p= &s->plane[plane_index];
2846
    const int block_size = MB_SIZE >> s->block_max_depth;
2847
    const int block_w    = plane_index ? block_size/2 : block_size;
2848
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2849
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2850
    const int ref_stride= s->current_picture.linesize[plane_index];
2851
    uint8_t *src= s-> input_picture.data[plane_index];
2852
    DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2853
    const int b_stride = s->b_width << s->block_max_depth;
2854
    const int w= p->width;
2855
    const int h= p->height;
2856
    int index= mb_x + mb_y*b_stride;
2857
    BlockNode *b= &s->block[index];
2858
    BlockNode backup= *b;
2859
    int ab=0;
2860
    int aa=0;
2861

    
2862
    b->type|= BLOCK_INTRA;
2863
    b->color[plane_index]= 0;
2864
    memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2865

    
2866
    for(i=0; i<4; i++){
2867
        int mb_x2= mb_x + (i &1) - 1;
2868
        int mb_y2= mb_y + (i>>1) - 1;
2869
        int x= block_w*mb_x2 + block_w/2;
2870
        int y= block_w*mb_y2 + block_w/2;
2871

    
2872
        add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2873
                    x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2874

    
2875
        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2876
            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2877
                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2878
                int obmc_v= obmc[index];
2879
                int d;
2880
                if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2881
                if(x<0) obmc_v += obmc[index + block_w];
2882
                if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2883
                if(x+block_w>w) obmc_v += obmc[index - block_w];
2884
                //FIXME precalc this or simplify it somehow else
2885

    
2886
                d = -dst[index] + (1<<(FRAC_BITS-1));
2887
                dst[index] = d;
2888
                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2889
                aa += obmc_v * obmc_v; //FIXME precalclate this
2890
            }
2891
        }
2892
    }
2893
    *b= backup;
2894

    
2895
    return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we should not need clipping
2896
}
2897

    
2898
static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2899
    const int b_stride = s->b_width << s->block_max_depth;
2900
    const int b_height = s->b_height<< s->block_max_depth;
2901
    int index= x + y*b_stride;
2902
    const BlockNode *b     = &s->block[index];
2903
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2904
    const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2905
    const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2906
    const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2907
    int dmx, dmy;
2908
//  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2909
//  int my_context= av_log2(2*FFABS(left->my - top->my));
2910

    
2911
    if(x<0 || x>=b_stride || y>=b_height)
2912
        return 0;
2913
/*
2914
1            0      0
2915
01X          1-2    1
2916
001XX        3-6    2-3
2917
0001XXX      7-14   4-7
2918
00001XXXX   15-30   8-15
2919
*/
2920
//FIXME try accurate rate
2921
//FIXME intra and inter predictors if surrounding blocks arent the same type
2922
    if(b->type & BLOCK_INTRA){
2923
        return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2924
                   + av_log2(2*FFABS(left->color[1] - b->color[1]))
2925
                   + av_log2(2*FFABS(left->color[2] - b->color[2])));
2926
    }else{
2927
        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2928
        dmx-= b->mx;
2929
        dmy-= b->my;
2930
        return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2931
                    + av_log2(2*FFABS(dmy))
2932
                    + av_log2(2*b->ref));
2933
    }
2934
}
2935

    
2936
static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2937
    Plane *p= &s->plane[plane_index];
2938
    const int block_size = MB_SIZE >> s->block_max_depth;
2939
    const int block_w    = plane_index ? block_size/2 : block_size;
2940
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2941
    const int ref_stride= s->current_picture.linesize[plane_index];
2942
    uint8_t *dst= s->current_picture.data[plane_index];
2943
    uint8_t *src= s->  input_picture.data[plane_index];
2944
    DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2945
    uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2946
    uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
2947
    const int b_stride = s->b_width << s->block_max_depth;
2948
    const int b_height = s->b_height<< s->block_max_depth;
2949
    const int w= p->width;
2950
    const int h= p->height;
2951
    int distortion;
2952
    int rate= 0;
2953
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2954
    int sx= block_w*mb_x - block_w/2;
2955
    int sy= block_w*mb_y - block_w/2;
2956
    int x0= FFMAX(0,-sx);
2957
    int y0= FFMAX(0,-sy);
2958
    int x1= FFMIN(block_w*2, w-sx);
2959
    int y1= FFMIN(block_w*2, h-sy);
2960
    int i,x,y;
2961

    
2962
    pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2963

    
2964
    for(y=y0; y<y1; y++){
2965
        const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2966
        const DWTELEM *pred1 = pred + y*obmc_stride;
2967
        uint8_t *cur1 = cur + y*ref_stride;
2968
        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2969
        for(x=x0; x<x1; x++){
2970
            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2971
            v = (v + pred1[x]) >> FRAC_BITS;
2972
            if(v&(~255)) v= ~(v>>31);
2973
            dst1[x] = v;
2974
        }
2975
    }
2976

    
2977
    /* copy the regions where obmc[] = (uint8_t)256 */
2978
    if(LOG2_OBMC_MAX == 8
2979
        && (mb_x == 0 || mb_x == b_stride-1)
2980
        && (mb_y == 0 || mb_y == b_height-1)){
2981
        if(mb_x == 0)
2982
            x1 = block_w;
2983
        else
2984
            x0 = block_w;
2985
        if(mb_y == 0)
2986
            y1 = block_w;
2987
        else
2988
            y0 = block_w;
2989
        for(y=y0; y<y1; y++)
2990
            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2991
    }
2992

    
2993
    if(block_w==16){
2994
        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
2995
        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
2996
        /* FIXME cmps overlap but don't cover the wavelet's whole support,
2997
         * so improving the score of one block is not strictly guaranteed to
2998
         * improve the score of the whole frame, so iterative motion est
2999
         * doesn't always converge. */
3000
        if(s->avctx->me_cmp == FF_CMP_W97)
3001
            distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3002
        else if(s->avctx->me_cmp == FF_CMP_W53)
3003
            distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3004
        else{
3005
            distortion = 0;
3006
            for(i=0; i<4; i++){
3007
                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3008
                distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3009
            }
3010
        }
3011
    }else{
3012
        assert(block_w==8);
3013
        distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3014
    }
3015

    
3016
    if(plane_index==0){
3017
        for(i=0; i<4; i++){
3018
/* ..RRr
3019
 * .RXx.
3020
 * rxx..
3021
 */
3022
            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3023
        }
3024
        if(mb_x == b_stride-2)
3025
            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3026
    }
3027
    return distortion + rate*penalty_factor;
3028
}
3029

    
3030
static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3031
    int i, y2;
3032
    Plane *p= &s->plane[plane_index];
3033
    const int block_size = MB_SIZE >> s->block_max_depth;
3034
    const int block_w    = plane_index ? block_size/2 : block_size;
3035
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3036
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3037
    const int ref_stride= s->current_picture.linesize[plane_index];
3038
    uint8_t *dst= s->current_picture.data[plane_index];
3039
    uint8_t *src= s-> input_picture.data[plane_index];
3040
    static const DWTELEM zero_dst[4096]; //FIXME
3041
    const int b_stride = s->b_width << s->block_max_depth;
3042
    const int w= p->width;
3043
    const int h= p->height;
3044
    int distortion= 0;
3045
    int rate= 0;
3046
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3047

    
3048
    for(i=0; i<9; i++){
3049
        int mb_x2= mb_x + (i%3) - 1;
3050
        int mb_y2= mb_y + (i/3) - 1;
3051
        int x= block_w*mb_x2 + block_w/2;
3052
        int y= block_w*mb_y2 + block_w/2;
3053

    
3054
        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
3055
                   x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3056

    
3057
        //FIXME find a cleaner/simpler way to skip the outside stuff
3058
        for(y2= y; y2<0; y2++)
3059
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3060
        for(y2= h; y2<y+block_w; y2++)
3061
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3062
        if(x<0){
3063
            for(y2= y; y2<y+block_w; y2++)
3064
                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3065
        }
3066
        if(x+block_w > w){
3067
            for(y2= y; y2<y+block_w; y2++)
3068
                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3069
        }
3070

    
3071
        assert(block_w== 8 || block_w==16);
3072
        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3073
    }
3074

    
3075
    if(plane_index==0){
3076
        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3077
        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3078

    
3079
/* ..RRRr
3080
 * .RXXx.
3081
 * .RXXx.
3082
 * rxxx.
3083
 */
3084
        if(merged)
3085
            rate = get_block_bits(s, mb_x, mb_y, 2);
3086
        for(i=merged?4:0; i<9; i++){
3087
            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3088
            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3089
        }
3090
    }
3091
    return distortion + rate*penalty_factor;
3092
}
3093

    
3094
static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3095
    const int b_stride= s->b_width << s->block_max_depth;
3096
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3097
    BlockNode backup= *block;
3098
    int rd, index, value;
3099

    
3100
    assert(mb_x>=0 && mb_y>=0);
3101
    assert(mb_x<b_stride);
3102

    
3103
    if(intra){
3104
        block->color[0] = p[0];
3105
        block->color[1] = p[1];
3106
        block->color[2] = p[2];
3107
        block->type |= BLOCK_INTRA;
3108
    }else{
3109
        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3110
        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3111
        if(s->me_cache[index] == value)
3112
            return 0;
3113
        s->me_cache[index]= value;
3114

    
3115
        block->mx= p[0];
3116
        block->my= p[1];
3117
        block->type &= ~BLOCK_INTRA;
3118
    }
3119

    
3120
    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3121

    
3122
//FIXME chroma
3123
    if(rd < *best_rd){
3124
        *best_rd= rd;
3125
        return 1;
3126
    }else{
3127
        *block= backup;
3128
        return 0;
3129
    }
3130
}
3131

    
3132
/* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3133
static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3134
    int p[2] = {p0, p1};
3135
    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3136
}
3137

    
3138
static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3139
    const int b_stride= s->b_width << s->block_max_depth;
3140
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3141
    BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3142
    int rd, index, value;
3143

    
3144
    assert(mb_x>=0 && mb_y>=0);
3145
    assert(mb_x<b_stride);
3146
    assert(((mb_x|mb_y)&1) == 0);
3147

    
3148
    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3149
    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3150
    if(s->me_cache[index] == value)
3151
        return 0;
3152
    s->me_cache[index]= value;
3153

    
3154
    block->mx= p0;
3155
    block->my= p1;
3156
    block->ref= ref;
3157
    block->type &= ~BLOCK_INTRA;
3158
    block[1]= block[b_stride]= block[b_stride+1]= *block;
3159

    
3160
    rd= get_4block_rd(s, mb_x, mb_y, 0);
3161

    
3162
//FIXME chroma
3163
    if(rd < *best_rd){
3164
        *best_rd= rd;
3165
        return 1;
3166
    }else{
3167
        block[0]= backup[0];
3168
        block[1]= backup[1];
3169
        block[b_stride]= backup[2];
3170
        block[b_stride+1]= backup[3];
3171
        return 0;
3172
    }
3173
}
3174

    
3175
static void iterative_me(SnowContext *s){
3176
    int pass, mb_x, mb_y;
3177
    const int b_width = s->b_width  << s->block_max_depth;
3178
    const int b_height= s->b_height << s->block_max_depth;
3179
    const int b_stride= b_width;
3180
    int color[3];
3181

    
3182
    {
3183
        RangeCoder r = s->c;
3184
        uint8_t state[sizeof(s->block_state)];
3185
        memcpy(state, s->block_state, sizeof(s->block_state));
3186
        for(mb_y= 0; mb_y<s->b_height; mb_y++)
3187
            for(mb_x= 0; mb_x<s->b_width; mb_x++)
3188
                encode_q_branch(s, 0, mb_x, mb_y);
3189
        s->c = r;
3190
        memcpy(s->block_state, state, sizeof(s->block_state));
3191
    }
3192

    
3193
    for(pass=0; pass<25; pass++){
3194
        int change= 0;
3195

    
3196
        for(mb_y= 0; mb_y<b_height; mb_y++){
3197
            for(mb_x= 0; mb_x<b_width; mb_x++){
3198
                int dia_change, i, j, ref;
3199
                int best_rd= INT_MAX, ref_rd;
3200
                BlockNode backup, ref_b;
3201
                const int index= mb_x + mb_y * b_stride;
3202
                BlockNode *block= &s->block[index];
3203
                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3204
                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3205
                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3206
                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3207
                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3208
                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3209
                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3210
                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3211
                const int b_w= (MB_SIZE >> s->block_max_depth);
3212
                uint8_t obmc_edged[b_w*2][b_w*2];
3213

    
3214
                if(pass && (block->type & BLOCK_OPT))
3215
                    continue;
3216
                block->type |= BLOCK_OPT;
3217

    
3218
                backup= *block;
3219

    
3220
                if(!s->me_cache_generation)
3221
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3222
                s->me_cache_generation += 1<<22;
3223

    
3224
                //FIXME precalc
3225
                {
3226
                    int x, y;
3227
                    memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3228
                    if(mb_x==0)
3229
                        for(y=0; y<b_w*2; y++)
3230
                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3231
                    if(mb_x==b_stride-1)
3232
                        for(y=0; y<b_w*2; y++)
3233
                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3234
                    if(mb_y==0){
3235
                        for(x=0; x<b_w*2; x++)
3236
                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
3237
                        for(y=1; y<b_w; y++)
3238
                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3239
                    }
3240
                    if(mb_y==b_height-1){
3241
                        for(x=0; x<b_w*2; x++)
3242
                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3243
                        for(y=b_w; y<b_w*2-1; y++)
3244
                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3245
                    }
3246
                }
3247

    
3248
                //skip stuff outside the picture
3249
                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3250
                {
3251
                    uint8_t *src= s->  input_picture.data[0];
3252
                    uint8_t *dst= s->current_picture.data[0];
3253
                    const int stride= s->current_picture.linesize[0];
3254
                    const int block_w= MB_SIZE >> s->block_max_depth;
3255
                    const int sx= block_w*mb_x - block_w/2;
3256
                    const int sy= block_w*mb_y - block_w/2;
3257
                    const int w= s->plane[0].width;
3258
                    const int h= s->plane[0].height;
3259
                    int y;
3260

    
3261
                    for(y=sy; y<0; y++)
3262
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3263
                    for(y=h; y<sy+block_w*2; y++)
3264
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3265
                    if(sx<0){
3266
                        for(y=sy; y<sy+block_w*2; y++)
3267
                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3268
                    }
3269
                    if(sx+block_w*2 > w){
3270
                        for(y=sy; y<sy+block_w*2; y++)
3271
                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3272
                    }
3273
                }
3274

    
3275
                // intra(black) = neighbors' contribution to the current block
3276
                for(i=0; i<3; i++)
3277
                    color[i]= get_dc(s, mb_x, mb_y, i);
3278

    
3279
                // get previous score (cannot be cached due to OBMC)
3280
                if(pass > 0 && (block->type&BLOCK_INTRA)){
3281
                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
3282
                    check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3283
                }else
3284
                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3285

    
3286
                ref_b= *block;
3287
                ref_rd= best_rd;
3288
                for(ref=0; ref < s->ref_frames; ref++){
3289
                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3290
                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3291
                        continue;
3292
                    block->ref= ref;
3293
                    best_rd= INT_MAX;
3294

    
3295
                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3296
                    check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3297
                    if(tb)
3298
                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3299
                    if(lb)
3300
                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3301
                    if(rb)
3302
                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3303
                    if(bb)
3304
                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3305

    
3306
                    /* fullpel ME */
3307
                    //FIXME avoid subpel interpol / round to nearest integer
3308
                    do{
3309
                        dia_change=0;
3310
                        for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3311
                            for(j=0; j<i; j++){
3312
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3313
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3314
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3315
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3316
                            }
3317
                        }
3318
                    }while(dia_change);
3319
                    /* subpel ME */
3320
                    do{
3321
                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3322
                        dia_change=0;
3323
                        for(i=0; i<8; i++)
3324
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3325
                    }while(dia_change);
3326
                    //FIXME or try the standard 2 pass qpel or similar
3327

    
3328
                    mvr[0][0]= block->mx;
3329
                    mvr[0][1]= block->my;
3330
                    if(ref_rd > best_rd){
3331
                        ref_rd= best_rd;
3332
                        ref_b= *block;
3333
                    }
3334
                }
3335
                best_rd= ref_rd;
3336
                *block= ref_b;
3337
#if 1
3338
                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3339
                //FIXME RD style color selection
3340
#endif
3341
                if(!same_block(block, &backup)){
3342
                    if(tb ) tb ->type &= ~BLOCK_OPT;
3343
                    if(lb ) lb ->type &= ~BLOCK_OPT;
3344
                    if(rb ) rb ->type &= ~BLOCK_OPT;
3345
                    if(bb ) bb ->type &= ~BLOCK_OPT;
3346
                    if(tlb) tlb->type &= ~BLOCK_OPT;
3347
                    if(trb) trb->type &= ~BLOCK_OPT;
3348
                    if(blb) blb->type &= ~BLOCK_OPT;
3349
                    if(brb) brb->type &= ~BLOCK_OPT;
3350
                    change ++;
3351
                }
3352
            }
3353
        }
3354
        av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3355
        if(!change)
3356
            break;
3357
    }
3358

    
3359
    if(s->block_max_depth == 1){
3360
        int change= 0;
3361
        for(mb_y= 0; mb_y<b_height; mb_y+=2){
3362
            for(mb_x= 0; mb_x<b_width; mb_x+=2){
3363
                int i;
3364
                int best_rd, init_rd;
3365
                const int index= mb_x + mb_y * b_stride;
3366
                BlockNode *b[4];
3367

    
3368
                b[0]= &s->block[index];
3369
                b[1]= b[0]+1;
3370
                b[2]= b[0]+b_stride;
3371
                b[3]= b[2]+1;
3372
                if(same_block(b[0], b[1]) &&
3373
                   same_block(b[0], b[2]) &&
3374
                   same_block(b[0], b[3]))
3375
                    continue;
3376

    
3377
                if(!s->me_cache_generation)
3378
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3379
                s->me_cache_generation += 1<<22;
3380

    
3381
                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3382

    
3383
                //FIXME more multiref search?
3384
                check_4block_inter(s, mb_x, mb_y,
3385
                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3386
                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3387

    
3388
                for(i=0; i<4; i++)
3389
                    if(!(b[i]->type&BLOCK_INTRA))
3390
                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3391

    
3392
                if(init_rd != best_rd)
3393
                    change++;
3394
            }
3395
        }
3396
        av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3397
    }
3398
}
3399

    
3400
static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3401
    const int level= b->level;
3402
    const int w= b->width;
3403
    const int h= b->height;
3404
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3405
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3406
    int x,y, thres1, thres2;
3407
//    START_TIMER
3408

    
3409
    if(s->qlog == LOSSLESS_QLOG) return;
3410

    
3411
    bias= bias ? 0 : (3*qmul)>>3;
3412
    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3413
    thres2= 2*thres1;
3414

    
3415
    if(!bias){
3416
        for(y=0; y<h; y++){
3417
            for(x=0; x<w; x++){
3418
                int i= src[x + y*stride];
3419

    
3420
                if((unsigned)(i+thres1) > thres2){
3421
                    if(i>=0){
3422
                        i<<= QEXPSHIFT;
3423
                        i/= qmul; //FIXME optimize
3424
                        src[x + y*stride]=  i;
3425
                    }else{
3426
                        i= -i;
3427
                        i<<= QEXPSHIFT;
3428
                        i/= qmul; //FIXME optimize
3429
                        src[x + y*stride]= -i;
3430
                    }
3431
                }else
3432
                    src[x + y*stride]= 0;
3433
            }
3434
        }
3435
    }else{
3436
        for(y=0; y<h; y++){
3437
            for(x=0; x<w; x++){
3438
                int i= src[x + y*stride];
3439

    
3440
                if((unsigned)(i+thres1) > thres2){
3441
                    if(i>=0){
3442
                        i<<= QEXPSHIFT;
3443
                        i= (i + bias) / qmul; //FIXME optimize
3444
                        src[x + y*stride]=  i;
3445
                    }else{
3446
                        i= -i;
3447
                        i<<= QEXPSHIFT;
3448
                        i= (i + bias) / qmul; //FIXME optimize
3449
                        src[x + y*stride]= -i;
3450
                    }
3451
                }else
3452
                    src[x + y*stride]= 0;
3453
            }
3454
        }
3455
    }
3456
    if(level+1 == s->spatial_decomposition_count){
3457
//        STOP_TIMER("quantize")
3458
    }
3459
}
3460

    
3461
static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3462
    const int w= b->width;
3463
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3464
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3465
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3466
    int x,y;
3467
    START_TIMER
3468

    
3469
    if(s->qlog == LOSSLESS_QLOG) return;
3470

    
3471
    for(y=start_y; y<end_y; y++){
3472
//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3473
        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3474
        for(x=0; x<w; x++){
3475
            int i= line[x];
3476
            if(i<0){
3477
                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3478
            }else if(i>0){
3479
                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3480
            }
3481
        }
3482
    }
3483
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3484
        STOP_TIMER("dquant")
3485
    }
3486
}
3487

    
3488
static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3489
    const int w= b->width;
3490
    const int h= b->height;
3491
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3492
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3493
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3494
    int x,y;
3495
    START_TIMER
3496

    
3497
    if(s->qlog == LOSSLESS_QLOG) return;
3498

    
3499
    for(y=0; y<h; y++){
3500
        for(x=0; x<w; x++){
3501
            int i= src[x + y*stride];
3502
            if(i<0){
3503
                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3504
            }else if(i>0){
3505
                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3506
            }
3507
        }
3508
    }
3509
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3510
        STOP_TIMER("dquant")
3511
    }
3512
}
3513

    
3514
static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3515
    const int w= b->width;
3516
    const int h= b->height;
3517
    int x,y;
3518

    
3519
    for(y=h-1; y>=0; y--){
3520
        for(x=w-1; x>=0; x--){
3521
            int i= x + y*stride;
3522

    
3523
            if(x){
3524
                if(use_median){
3525
                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3526
                    else  src[i] -= src[i - 1];
3527
                }else{
3528
                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3529
                    else  src[i] -= src[i - 1];
3530
                }
3531
            }else{
3532
                if(y) src[i] -= src[i - stride];
3533
            }
3534
        }
3535
    }
3536
}
3537

    
3538
static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3539
    const int w= b->width;
3540
    int x,y;
3541

    
3542
//    START_TIMER
3543

    
3544
    DWTELEM * line=0; // silence silly "could be used without having been initialized" warning
3545
    DWTELEM * prev;
3546

    
3547
    if (start_y != 0)
3548
        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3549

    
3550
    for(y=start_y; y<end_y; y++){
3551
        prev = line;
3552
//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3553
        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3554
        for(x=0; x<w; x++){
3555
            if(x){
3556
                if(use_median){
3557
                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3558
                    else  line[x] += line[x - 1];
3559
                }else{
3560
                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3561
                    else  line[x] += line[x - 1];
3562
                }
3563
            }else{
3564
                if(y) line[x] += prev[x];
3565
            }
3566
        }
3567
    }
3568

    
3569
//    STOP_TIMER("correlate")
3570
}
3571

    
3572
static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3573
    const int w= b->width;
3574
    const int h= b->height;
3575
    int x,y;
3576

    
3577
    for(y=0; y<h; y++){
3578
        for(x=0; x<w; x++){
3579
            int i= x + y*stride;
3580

    
3581
            if(x){
3582
                if(use_median){
3583
                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3584
                    else  src[i] += src[i - 1];
3585
                }else{
3586
                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3587
                    else  src[i] += src[i - 1];
3588
                }
3589
            }else{
3590
                if(y) src[i] += src[i - stride];
3591
            }
3592
        }
3593
    }
3594
}
3595

    
3596
static void encode_header(SnowContext *s){
3597
    int plane_index, level, orientation;
3598
    uint8_t kstate[32];
3599

    
3600
    memset(kstate, MID_STATE, sizeof(kstate));
3601

    
3602
    put_rac(&s->c, kstate, s->keyframe);
3603
    if(s->keyframe || s->always_reset){
3604
        reset_contexts(s);
3605
        s->last_spatial_decomposition_type=
3606
        s->last_qlog=
3607
        s->last_qbias=
3608
        s->last_mv_scale=
3609
        s->last_block_max_depth= 0;
3610
    }
3611
    if(s->keyframe){
3612
        put_symbol(&s->c, s->header_state, s->version, 0);
3613
        put_rac(&s->c, s->header_state, s->always_reset);
3614
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3615
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3616
        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3617
        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3618
        put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3619
        put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3620
        put_rac(&s->c, s->header_state, s->spatial_scalability);
3621
//        put_rac(&s->c, s->header_state, s->rate_scalability);
3622
        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3623

    
3624
        for(plane_index=0; plane_index<2; plane_index++){
3625
            for(level=0; level<s->spatial_decomposition_count; level++){
3626
                for(orientation=level ? 1:0; orientation<4; orientation++){
3627
                    if(orientation==2) continue;
3628
                    put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3629
                }
3630
            }
3631
        }
3632
    }
3633
    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3634
    put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3635
    put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3636
    put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3637
    put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3638

    
3639
    s->last_spatial_decomposition_type= s->spatial_decomposition_type;
3640
    s->last_qlog                      = s->qlog;
3641
    s->last_qbias                     = s->qbias;
3642
    s->last_mv_scale                  = s->mv_scale;
3643
    s->last_block_max_depth           = s->block_max_depth;
3644
}
3645

    
3646
static int decode_header(SnowContext *s){
3647
    int plane_index, level, orientation;
3648
    uint8_t kstate[32];
3649

    
3650
    memset(kstate, MID_STATE, sizeof(kstate));
3651

    
3652
    s->keyframe= get_rac(&s->c, kstate);
3653
    if(s->keyframe || s->always_reset){
3654
        reset_contexts(s);
3655
        s->spatial_decomposition_type=
3656
        s->qlog=
3657
        s->qbias=
3658
        s->mv_scale=
3659
        s->block_max_depth= 0;
3660
    }
3661
    if(s->keyframe){
3662
        s->version= get_symbol(&s->c, s->header_state, 0);
3663
        if(s->version>0){
3664
            av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3665
            return -1;
3666
        }
3667
        s->always_reset= get_rac(&s->c, s->header_state);
3668
        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3669
        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3670
        s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3671
        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3672
        s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3673
        s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3674
        s->spatial_scalability= get_rac(&s->c, s->header_state);
3675
//        s->rate_scalability= get_rac(&s->c, s->header_state);
3676
        s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3677

    
3678
        for(plane_index=0; plane_index<3; plane_index++){
3679
            for(level=0; level<s->spatial_decomposition_count; level++){
3680
                for(orientation=level ? 1:0; orientation<4; orientation++){
3681
                    int q;
3682
                    if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3683
                    else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3684
                    else                    q= get_symbol(&s->c, s->header_state, 1);
3685
                    s->plane[plane_index].band[level][orientation].qlog= q;
3686
                }
3687
            }
3688
        }
3689
    }
3690

    
3691
    s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3692
    if(s->spatial_decomposition_type > 2){
3693
        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3694
        return -1;
3695
    }
3696

    
3697
    s->qlog           += get_symbol(&s->c, s->header_state, 1);
3698
    s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3699
    s->qbias          += get_symbol(&s->c, s->header_state, 1);
3700
    s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3701
    if(s->block_max_depth > 1 || s->block_max_depth < 0){
3702
        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3703
        s->block_max_depth= 0;
3704
        return -1;
3705
    }
3706

    
3707
    return 0;
3708
}
3709

    
3710
static void init_qexp(void){
3711
    int i;
3712
    double v=128;
3713

    
3714
    for(i=0; i<QROOT; i++){
3715
        qexp[i]= lrintf(v);
3716
        v *= pow(2, 1.0 / QROOT);
3717
    }
3718
}
3719

    
3720
static int common_init(AVCodecContext *avctx){
3721
    SnowContext *s = avctx->priv_data;
3722
    int width, height;
3723
    int level, orientation, plane_index, dec;
3724
    int i, j;
3725

    
3726
    s->avctx= avctx;
3727

    
3728
    dsputil_init(&s->dsp, avctx);
3729

    
3730
#define mcf(dx,dy)\
3731
    s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3732
    s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3733
        s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3734
    s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3735
    s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3736
        s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3737

    
3738
    mcf( 0, 0)
3739
    mcf( 4, 0)
3740
    mcf( 8, 0)
3741
    mcf(12, 0)
3742
    mcf( 0, 4)
3743
    mcf( 4, 4)
3744
    mcf( 8, 4)
3745
    mcf(12, 4)
3746
    mcf( 0, 8)
3747
    mcf( 4, 8)
3748
    mcf( 8, 8)
3749
    mcf(12, 8)
3750
    mcf( 0,12)
3751
    mcf( 4,12)
3752
    mcf( 8,12)
3753
    mcf(12,12)
3754

    
3755
#define mcfh(dx,dy)\
3756
    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3757
    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3758
        mc_block_hpel ## dx ## dy ## 16;\
3759
    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3760
    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3761
        mc_block_hpel ## dx ## dy ## 8;
3762

    
3763
    mcfh(0, 0)
3764
    mcfh(8, 0)
3765
    mcfh(0, 8)
3766
    mcfh(8, 8)
3767

    
3768
    if(!qexp[0])
3769
        init_qexp();
3770

    
3771
    dec= s->spatial_decomposition_count= 5;
3772
    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3773

    
3774
    s->chroma_h_shift= 1; //FIXME XXX
3775
    s->chroma_v_shift= 1;
3776

    
3777
//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3778

    
3779
    width= s->avctx->width;
3780
    height= s->avctx->height;
3781

    
3782
    s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3783

    
3784
    s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3785
    s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3786

    
3787
    for(plane_index=0; plane_index<3; plane_index++){
3788
        int w= s->avctx->width;
3789
        int h= s->avctx->height;
3790

    
3791
        if(plane_index){
3792
            w>>= s->chroma_h_shift;
3793
            h>>= s->chroma_v_shift;
3794
        }
3795
        s->plane[plane_index].width = w;
3796
        s->plane[plane_index].height= h;
3797
//av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3798
        for(level=s->spatial_decomposition_count-1; level>=0; level--){
3799
            for(orientation=level ? 1 : 0; orientation<4; orientation++){
3800
                SubBand *b= &s->plane[plane_index].band[level][orientation];
3801

    
3802
                b->buf= s->spatial_dwt_buffer;
3803
                b->level= level;
3804
                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3805
                b->width = (w + !(orientation&1))>>1;
3806
                b->height= (h + !(orientation>1))>>1;
3807

    
3808
                b->stride_line = 1 << (s->spatial_decomposition_count - level);
3809
                b->buf_x_offset = 0;
3810
                b->buf_y_offset = 0;
3811

    
3812
                if(orientation&1){
3813
                    b->buf += (w+1)>>1;
3814
                    b->buf_x_offset = (w+1)>>1;
3815
                }
3816
                if(orientation>1){
3817
                    b->buf += b->stride>>1;
3818
                    b->buf_y_offset = b->stride_line >> 1;
3819
                }
3820

    
3821
                if(level)
3822
                    b->parent= &s->plane[plane_index].band[level-1][orientation];
3823
                b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3824
            }
3825
            w= (w+1)>>1;
3826
            h= (h+1)>>1;
3827
        }
3828
    }
3829

    
3830
    for(i=0; i<MAX_REF_FRAMES; i++)
3831
        for(j=0; j<MAX_REF_FRAMES; j++)
3832
            scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3833

    
3834
    reset_contexts(s);
3835
/*
3836
    width= s->width= avctx->width;
3837
    height= s->height= avctx->height;
3838

3839
    assert(width && height);
3840
*/
3841
    s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3842

    
3843
    return 0;
3844
}
3845

    
3846
static int qscale2qlog(int qscale){
3847
    return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3848
           + 61*QROOT/8; //<64 >60
3849
}
3850

    
3851
static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3852
{
3853
    /* estimate the frame's complexity as a sum of weighted dwt coefs.
3854
     * FIXME we know exact mv bits at this point,
3855
     * but ratecontrol isn't set up to include them. */
3856
    uint32_t coef_sum= 0;
3857
    int level, orientation, delta_qlog;
3858

    
3859
    for(level=0; level<s->spatial_decomposition_count; level++){
3860
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3861
            SubBand *b= &s->plane[0].band[level][orientation];
3862
            DWTELEM *buf= b->buf;
3863
            const int w= b->width;
3864
            const int h= b->height;
3865
            const int stride= b->stride;
3866
            const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3867
            const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3868
            const int qdiv= (1<<16)/qmul;
3869
            int x, y;
3870
            if(orientation==0)
3871
                decorrelate(s, b, buf, stride, 1, 0);
3872
            for(y=0; y<h; y++)
3873
                for(x=0; x<w; x++)
3874
                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3875
            if(orientation==0)
3876
                correlate(s, b, buf, stride, 1, 0);
3877
        }
3878
    }
3879

    
3880
    /* ugly, ratecontrol just takes a sqrt again */
3881
    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3882
    assert(coef_sum < INT_MAX);
3883

    
3884
    if(pict->pict_type == I_TYPE){
3885
        s->m.current_picture.mb_var_sum= coef_sum;
3886
        s->m.current_picture.mc_mb_var_sum= 0;
3887
    }else{
3888
        s->m.current_picture.mc_mb_var_sum= coef_sum;
3889
        s->m.current_picture.mb_var_sum= 0;
3890
    }
3891

    
3892
    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3893
    if (pict->quality < 0)
3894
        return INT_MIN;
3895
    s->lambda= pict->quality * 3/2;
3896
    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3897
    s->qlog+= delta_qlog;
3898
    return delta_qlog;
3899
}
3900

    
3901
static void calculate_vissual_weight(SnowContext *s, Plane *p){
3902
    int width = p->width;
3903
    int height= p->height;
3904
    int level, orientation, x, y;
3905

    
3906
    for(level=0; level<s->spatial_decomposition_count; level++){
3907
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3908
            SubBand *b= &p->band[level][orientation];
3909
            DWTELEM *buf= b->buf;
3910
            int64_t error=0;
3911

    
3912
            memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3913
            buf[b->width/2 + b->height/2*b->stride]= 256*256;
3914
            ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3915
            for(y=0; y<height; y++){
3916
                for(x=0; x<width; x++){
3917
                    int64_t d= s->spatial_dwt_buffer[x + y*width];
3918
                    error += d*d;
3919
                }
3920
            }
3921

    
3922
            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3923
//            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3924
        }
3925
    }
3926
}
3927

    
3928
static int encode_init(AVCodecContext *avctx)
3929
{
3930
    SnowContext *s = avctx->priv_data;
3931
    int plane_index;
3932

    
3933
    if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3934
        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3935
               "use vstrict=-2 / -strict -2 to use it anyway\n");
3936
        return -1;
3937
    }
3938

    
3939
    if(avctx->prediction_method == DWT_97
3940
       && (avctx->flags & CODEC_FLAG_QSCALE)
3941
       && avctx->global_quality == 0){
3942
        av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
3943
        return -1;
3944
    }
3945

    
3946
    common_init(avctx);
3947
    alloc_blocks(s);
3948

    
3949
    s->version=0;
3950

    
3951
    s->m.avctx   = avctx;
3952
    s->m.flags   = avctx->flags;
3953
    s->m.bit_rate= avctx->bit_rate;
3954

    
3955
    s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3956
    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3957
    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3958
    s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3959
    h263_encode_init(&s->m); //mv_penalty
3960

    
3961
    s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
3962

    
3963
    if(avctx->flags&CODEC_FLAG_PASS1){
3964
        if(!avctx->stats_out)
3965
            avctx->stats_out = av_mallocz(256);
3966
    }
3967
    if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
3968
        if(ff_rate_control_init(&s->m) < 0)
3969
            return -1;
3970
    }
3971
    s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
3972

    
3973
    for(plane_index=0; plane_index<3; plane_index++){
3974
        calculate_vissual_weight(s, &s->plane[plane_index]);
3975
    }
3976

    
3977

    
3978
    avctx->coded_frame= &s->current_picture;
3979
    switch(avctx->pix_fmt){
3980
//    case PIX_FMT_YUV444P:
3981
//    case PIX_FMT_YUV422P:
3982
    case PIX_FMT_YUV420P:
3983
    case PIX_FMT_GRAY8:
3984
//    case PIX_FMT_YUV411P:
3985
//    case PIX_FMT_YUV410P:
3986
        s->colorspace_type= 0;
3987
        break;
3988
/*    case PIX_FMT_RGB32:
3989
        s->colorspace= 1;
3990
        break;*/
3991
    default:
3992
        av_log(avctx, AV_LOG_ERROR, "format not supported\n");
3993
        return -1;
3994
    }
3995
//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
3996
    s->chroma_h_shift= 1;
3997
    s->chroma_v_shift= 1;
3998

    
3999
    ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4000
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4001

    
4002
    s->avctx->get_buffer(s->avctx, &s->input_picture);
4003

    
4004
    if(s->avctx->me_method == ME_ITER){
4005
        int i;
4006
        int size= s->b_width * s->b_height << 2*s->block_max_depth;
4007
        for(i=0; i<s->max_ref_frames; i++){
4008
            s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4009
            s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4010
        }
4011
    }
4012

    
4013
    return 0;
4014
}
4015

    
4016
static int frame_start(SnowContext *s){
4017
   AVFrame tmp;
4018
   int w= s->avctx->width; //FIXME round up to x16 ?
4019
   int h= s->avctx->height;
4020

    
4021
    if(s->current_picture.data[0]){
4022
        draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4023
        draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4024
        draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4025
    }
4026

    
4027
    tmp= s->last_picture[s->max_ref_frames-1];
4028
    memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4029
    s->last_picture[0]= s->current_picture;
4030
    s->current_picture= tmp;
4031

    
4032
    if(s->keyframe){
4033
        s->ref_frames= 0;
4034
    }else{
4035
        int i;
4036
        for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4037
            if(i && s->last_picture[i-1].key_frame)
4038
                break;
4039
        s->ref_frames= i;
4040
    }
4041

    
4042
    s->current_picture.reference= 1;