Statistics
| Branch: | Revision:

ffmpeg / libavcodec / snow.c @ b538791b

History | View | Annotate | Download (168 KB)

1
/*
2
 * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "avcodec.h"
22
#include "dsputil.h"
23
#include "snow.h"
24

    
25
#include "rangecoder.h"
26

    
27
#include "mpegvideo.h"
28

    
29
#undef NDEBUG
30
#include <assert.h>
31

    
32
static const int8_t quant3[256]={
33
 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
34
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
35
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
36
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
42
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
43
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
44
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
45
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
46
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
47
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
48
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
49
};
50
static const int8_t quant3b[256]={
51
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
57
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
59
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
60
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
61
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
62
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
63
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
64
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
65
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
66
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
67
};
68
static const int8_t quant3bA[256]={
69
 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
70
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
71
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
72
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
73
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
74
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
75
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
76
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
77
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
78
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
79
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
80
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
81
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
82
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
83
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
84
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
85
};
86
static const int8_t quant5[256]={
87
 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
88
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
91
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
92
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
93
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
94
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
95
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
96
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
97
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
98
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
99
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
100
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
101
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
102
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
103
};
104
static const int8_t quant7[256]={
105
 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
106
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
107
 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
108
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
109
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
110
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
111
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
112
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
113
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
114
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
115
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
116
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
117
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
118
-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
119
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
120
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
121
};
122
static const int8_t quant9[256]={
123
 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
124
 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
125
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
126
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
127
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
128
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
129
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
130
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
131
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
132
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
133
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
134
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
135
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
136
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
137
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
138
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
139
};
140
static const int8_t quant11[256]={
141
 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
142
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
143
 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
144
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
145
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
146
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
147
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
148
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
149
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
150
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
151
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
152
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
153
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
154
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
155
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
156
-4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
157
};
158
static const int8_t quant13[256]={
159
 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
160
 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
161
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
162
 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
163
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
164
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
165
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
166
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
167
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
168
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
169
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
170
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
171
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
172
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
173
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
174
-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
175
};
176

    
177
#if 0 //64*cubic
178
static const uint8_t obmc32[1024]={
179
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
181
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
182
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
183
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
184
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
185
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
186
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
187
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
188
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
189
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
190
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
191
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
192
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
193
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
194
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
195
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
196
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
197
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
198
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
199
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
200
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
201
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
202
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
203
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
204
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
205
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
206
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
207
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
208
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
209
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
210
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
211
//error:0.000022
212
};
213
static const uint8_t obmc16[256]={
214
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
215
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
216
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
217
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
218
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
219
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
220
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
221
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
222
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
223
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
224
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
225
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
226
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
227
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
228
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
229
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
230
//error:0.000033
231
};
232
#elif 1 // 64*linear
233
static const uint8_t obmc32[1024]={
234
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
235
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
236
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
237
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
238
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
239
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
240
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
241
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
242
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
243
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
244
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
245
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
246
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
247
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
248
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
249
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
250
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
251
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
252
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
253
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
254
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
255
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
256
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
257
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
258
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
259
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
260
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
261
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
262
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
263
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
264
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
265
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
266
 //error:0.000020
267
};
268
static const uint8_t obmc16[256]={
269
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
270
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
271
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
272
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
273
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
274
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
275
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
276
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
277
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
278
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
279
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
280
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
281
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
282
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
283
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
284
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
285
//error:0.000015
286
};
287
#else //64*cos
288
static const uint8_t obmc32[1024]={
289
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
290
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
291
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
292
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
293
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
294
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
295
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
296
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
297
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
298
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
299
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
300
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
301
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
302
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
303
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
304
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
305
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
306
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
307
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
308
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
309
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
310
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
311
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
312
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
313
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
314
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
315
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
316
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
317
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
318
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
319
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
320
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
321
//error:0.000022
322
};
323
static const uint8_t obmc16[256]={
324
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
325
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
326
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
327
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
328
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
329
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
330
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
331
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
332
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
333
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
334
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
335
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
336
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
337
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
338
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
339
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
340
//error:0.000022
341
};
342
#endif
343

    
344
//linear *64
345
static const uint8_t obmc8[64]={
346
  4, 12, 20, 28, 28, 20, 12,  4,
347
 12, 36, 60, 84, 84, 60, 36, 12,
348
 20, 60,100,140,140,100, 60, 20,
349
 28, 84,140,196,196,140, 84, 28,
350
 28, 84,140,196,196,140, 84, 28,
351
 20, 60,100,140,140,100, 60, 20,
352
 12, 36, 60, 84, 84, 60, 36, 12,
353
  4, 12, 20, 28, 28, 20, 12,  4,
354
//error:0.000000
355
};
356

    
357
//linear *64
358
static const uint8_t obmc4[16]={
359
 16, 48, 48, 16,
360
 48,144,144, 48,
361
 48,144,144, 48,
362
 16, 48, 48, 16,
363
//error:0.000000
364
};
365

    
366
static const uint8_t *obmc_tab[4]={
367
    obmc32, obmc16, obmc8, obmc4
368
};
369

    
370
static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
371

    
372
typedef struct BlockNode{
373
    int16_t mx;
374
    int16_t my;
375
    uint8_t ref;
376
    uint8_t color[3];
377
    uint8_t type;
378
//#define TYPE_SPLIT    1
379
#define BLOCK_INTRA   1
380
#define BLOCK_OPT     2
381
//#define TYPE_NOCOLOR  4
382
    uint8_t level; //FIXME merge into type?
383
}BlockNode;
384

    
385
static const BlockNode null_block= { //FIXME add border maybe
386
    .color= {128,128,128},
387
    .mx= 0,
388
    .my= 0,
389
    .ref= 0,
390
    .type= 0,
391
    .level= 0,
392
};
393

    
394
#define LOG2_MB_SIZE 4
395
#define MB_SIZE (1<<LOG2_MB_SIZE)
396
#define ENCODER_EXTRA_BITS 4
397

    
398
typedef struct x_and_coeff{
399
    int16_t x;
400
    uint16_t coeff;
401
} x_and_coeff;
402

    
403
typedef struct SubBand{
404
    int level;
405
    int stride;
406
    int width;
407
    int height;
408
    int qlog;                                   ///< log(qscale)/log[2^(1/6)]
409
    DWTELEM *buf;
410
    int buf_x_offset;
411
    int buf_y_offset;
412
    int stride_line; ///< Stride measured in lines, not pixels.
413
    x_and_coeff * x_coeff;
414
    struct SubBand *parent;
415
    uint8_t state[/*7*2*/ 7 + 512][32];
416
}SubBand;
417

    
418
typedef struct Plane{
419
    int width;
420
    int height;
421
    SubBand band[MAX_DECOMPOSITIONS][4];
422
}Plane;
423

    
424
typedef struct SnowContext{
425
//    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
426

    
427
    AVCodecContext *avctx;
428
    RangeCoder c;
429
    DSPContext dsp;
430
    AVFrame new_picture;
431
    AVFrame input_picture;              ///< new_picture with the internal linesizes
432
    AVFrame current_picture;
433
    AVFrame last_picture[MAX_REF_FRAMES];
434
    AVFrame mconly_picture;
435
//     uint8_t q_context[16];
436
    uint8_t header_state[32];
437
    uint8_t block_state[128 + 32*128];
438
    int keyframe;
439
    int always_reset;
440
    int version;
441
    int spatial_decomposition_type;
442
    int last_spatial_decomposition_type;
443
    int temporal_decomposition_type;
444
    int spatial_decomposition_count;
445
    int temporal_decomposition_count;
446
    int max_ref_frames;
447
    int ref_frames;
448
    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
449
    uint32_t *ref_scores[MAX_REF_FRAMES];
450
    DWTELEM *spatial_dwt_buffer;
451
    int colorspace_type;
452
    int chroma_h_shift;
453
    int chroma_v_shift;
454
    int spatial_scalability;
455
    int qlog;
456
    int last_qlog;
457
    int lambda;
458
    int lambda2;
459
    int pass1_rc;
460
    int mv_scale;
461
    int last_mv_scale;
462
    int qbias;
463
    int last_qbias;
464
#define QBIAS_SHIFT 3
465
    int b_width;
466
    int b_height;
467
    int block_max_depth;
468
    int last_block_max_depth;
469
    Plane plane[MAX_PLANES];
470
    BlockNode *block;
471
#define ME_CACHE_SIZE 1024
472
    int me_cache[ME_CACHE_SIZE];
473
    int me_cache_generation;
474
    slice_buffer sb;
475

    
476
    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
477
}SnowContext;
478

    
479
typedef struct {
480
    DWTELEM *b0;
481
    DWTELEM *b1;
482
    DWTELEM *b2;
483
    DWTELEM *b3;
484
    int y;
485
} dwt_compose_t;
486

    
487
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
488
//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
489

    
490
static void iterative_me(SnowContext *s);
491

    
492
static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
493
{
494
    int i;
495

    
496
    buf->base_buffer = base_buffer;
497
    buf->line_count = line_count;
498
    buf->line_width = line_width;
499
    buf->data_count = max_allocated_lines;
500
    buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
501
    buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
502

    
503
    for (i = 0; i < max_allocated_lines; i++)
504
    {
505
      buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
506
    }
507

    
508
    buf->data_stack_top = max_allocated_lines - 1;
509
}
510

    
511
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
512
{
513
    int offset;
514
    DWTELEM * buffer;
515

    
516
//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
517

    
518
    assert(buf->data_stack_top >= 0);
519
//  assert(!buf->line[line]);
520
    if (buf->line[line])
521
        return buf->line[line];
522

    
523
    offset = buf->line_width * line;
524
    buffer = buf->data_stack[buf->data_stack_top];
525
    buf->data_stack_top--;
526
    buf->line[line] = buffer;
527

    
528
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
529

    
530
    return buffer;
531
}
532

    
533
static void slice_buffer_release(slice_buffer * buf, int line)
534
{
535
    int offset;
536
    DWTELEM * buffer;
537

    
538
    assert(line >= 0 && line < buf->line_count);
539
    assert(buf->line[line]);
540

    
541
    offset = buf->line_width * line;
542
    buffer = buf->line[line];
543
    buf->data_stack_top++;
544
    buf->data_stack[buf->data_stack_top] = buffer;
545
    buf->line[line] = NULL;
546

    
547
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
548
}
549

    
550
static void slice_buffer_flush(slice_buffer * buf)
551
{
552
    int i;
553
    for (i = 0; i < buf->line_count; i++)
554
    {
555
        if (buf->line[i])
556
        {
557
//      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
558
            slice_buffer_release(buf, i);
559
        }
560
    }
561
}
562

    
563
static void slice_buffer_destroy(slice_buffer * buf)
564
{
565
    int i;
566
    slice_buffer_flush(buf);
567

    
568
    for (i = buf->data_count - 1; i >= 0; i--)
569
    {
570
        assert(buf->data_stack[i]);
571
        av_freep(&buf->data_stack[i]);
572
    }
573
    assert(buf->data_stack);
574
    av_freep(&buf->data_stack);
575
    assert(buf->line);
576
    av_freep(&buf->line);
577
}
578

    
579
#ifdef __sgi
580
// Avoid a name clash on SGI IRIX
581
#undef qexp
582
#endif
583
#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
584
static uint8_t qexp[QROOT];
585

    
586
static inline int mirror(int v, int m){
587
    while((unsigned)v > (unsigned)m){
588
        v=-v;
589
        if(v<0) v+= 2*m;
590
    }
591
    return v;
592
}
593

    
594
static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
595
    int i;
596

    
597
    if(v){
598
        const int a= FFABS(v);
599
        const int e= av_log2(a);
600
#if 1
601
        const int el= FFMIN(e, 10);
602
        put_rac(c, state+0, 0);
603

    
604
        for(i=0; i<el; i++){
605
            put_rac(c, state+1+i, 1);  //1..10
606
        }
607
        for(; i<e; i++){
608
            put_rac(c, state+1+9, 1);  //1..10
609
        }
610
        put_rac(c, state+1+FFMIN(i,9), 0);
611

    
612
        for(i=e-1; i>=el; i--){
613
            put_rac(c, state+22+9, (a>>i)&1); //22..31
614
        }
615
        for(; i>=0; i--){
616
            put_rac(c, state+22+i, (a>>i)&1); //22..31
617
        }
618

    
619
        if(is_signed)
620
            put_rac(c, state+11 + el, v < 0); //11..21
621
#else
622

    
623
        put_rac(c, state+0, 0);
624
        if(e<=9){
625
            for(i=0; i<e; i++){
626
                put_rac(c, state+1+i, 1);  //1..10
627
            }
628
            put_rac(c, state+1+i, 0);
629

    
630
            for(i=e-1; i>=0; i--){
631
                put_rac(c, state+22+i, (a>>i)&1); //22..31
632
            }
633

    
634
            if(is_signed)
635
                put_rac(c, state+11 + e, v < 0); //11..21
636
        }else{
637
            for(i=0; i<e; i++){
638
                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
639
            }
640
            put_rac(c, state+1+FFMIN(i,9), 0);
641

    
642
            for(i=e-1; i>=0; i--){
643
                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
644
            }
645

    
646
            if(is_signed)
647
                put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
648
        }
649
#endif
650
    }else{
651
        put_rac(c, state+0, 1);
652
    }
653
}
654

    
655
static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
656
    if(get_rac(c, state+0))
657
        return 0;
658
    else{
659
        int i, e, a;
660
        e= 0;
661
        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
662
            e++;
663
        }
664

    
665
        a= 1;
666
        for(i=e-1; i>=0; i--){
667
            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
668
        }
669

    
670
        if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
671
            return -a;
672
        else
673
            return a;
674
    }
675
}
676

    
677
static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
678
    int i;
679
    int r= log2>=0 ? 1<<log2 : 1;
680

    
681
    assert(v>=0);
682
    assert(log2>=-4);
683

    
684
    while(v >= r){
685
        put_rac(c, state+4+log2, 1);
686
        v -= r;
687
        log2++;
688
        if(log2>0) r+=r;
689
    }
690
    put_rac(c, state+4+log2, 0);
691

    
692
    for(i=log2-1; i>=0; i--){
693
        put_rac(c, state+31-i, (v>>i)&1);
694
    }
695
}
696

    
697
static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
698
    int i;
699
    int r= log2>=0 ? 1<<log2 : 1;
700
    int v=0;
701

    
702
    assert(log2>=-4);
703

    
704
    while(get_rac(c, state+4+log2)){
705
        v+= r;
706
        log2++;
707
        if(log2>0) r+=r;
708
    }
709

    
710
    for(i=log2-1; i>=0; i--){
711
        v+= get_rac(c, state+31-i)<<i;
712
    }
713

    
714
    return v;
715
}
716

    
717
static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
718
    const int mirror_left= !highpass;
719
    const int mirror_right= (width&1) ^ highpass;
720
    const int w= (width>>1) - 1 + (highpass & width);
721
    int i;
722

    
723
#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
724
    if(mirror_left){
725
        dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
726
        dst += dst_step;
727
        src += src_step;
728
    }
729

    
730
    for(i=0; i<w; i++){
731
        dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
732
    }
733

    
734
    if(mirror_right){
735
        dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
736
    }
737
}
738

    
739
#ifndef lift5
740
static av_always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
741
    const int mirror_left= !highpass;
742
    const int mirror_right= (width&1) ^ highpass;
743
    const int w= (width>>1) - 1 + (highpass & width);
744
    int i;
745

    
746
    if(mirror_left){
747
        int r= 3*2*ref[0];
748
        r += r>>4;
749
        r += r>>8;
750
        dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
751
        dst += dst_step;
752
        src += src_step;
753
    }
754

    
755
    for(i=0; i<w; i++){
756
        int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
757
        r += r>>4;
758
        r += r>>8;
759
        dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
760
    }
761

    
762
    if(mirror_right){
763
        int r= 3*2*ref[w*ref_step];
764
        r += r>>4;
765
        r += r>>8;
766
        dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
767
    }
768
}
769
#endif
770

    
771
#ifndef liftS
772
static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
773
    const int mirror_left= !highpass;
774
    const int mirror_right= (width&1) ^ highpass;
775
    const int w= (width>>1) - 1 + (highpass & width);
776
    int i;
777

    
778
    assert(shift == 4);
779
#define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): -((-16*(src) + (ref) + add/4 + 1 + (5<<25))/(5*4) - (1<<23)))
780
    if(mirror_left){
781
        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
782
        dst += dst_step;
783
        src += src_step;
784
    }
785

    
786
    for(i=0; i<w; i++){
787
        dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
788
    }
789

    
790
    if(mirror_right){
791
        dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
792
    }
793
}
794
#endif
795

    
796

    
797
static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
798
    int x, i;
799

    
800
    for(x=start; x<width; x+=2){
801
        int64_t sum=0;
802

    
803
        for(i=0; i<n; i++){
804
            int x2= x + 2*i - n + 1;
805
            if     (x2<     0) x2= -x2;
806
            else if(x2>=width) x2= 2*width-x2-2;
807
            sum += coeffs[i]*(int64_t)dst[x2];
808
        }
809
        if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
810
        else        dst[x] += (sum + (1<<shift)/2)>>shift;
811
    }
812
}
813

    
814
static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
815
    int x, y, i;
816
    for(y=start; y<height; y+=2){
817
        for(x=0; x<width; x++){
818
            int64_t sum=0;
819

    
820
            for(i=0; i<n; i++){
821
                int y2= y + 2*i - n + 1;
822
                if     (y2<      0) y2= -y2;
823
                else if(y2>=height) y2= 2*height-y2-2;
824
                sum += coeffs[i]*(int64_t)dst[x + y2*stride];
825
            }
826
            if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
827
            else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
828
        }
829
    }
830
}
831

    
832
#define SCALEX 1
833
#define LX0 0
834
#define LX1 1
835

    
836
#if 0 // more accurate 9/7
837
#define N1 2
838
#define SHIFT1 14
839
#define COEFFS1 (int[]){-25987,-25987}
840
#define N2 2
841
#define SHIFT2 19
842
#define COEFFS2 (int[]){-27777,-27777}
843
#define N3 2
844
#define SHIFT3 15
845
#define COEFFS3 (int[]){28931,28931}
846
#define N4 2
847
#define SHIFT4 15
848
#define COEFFS4 (int[]){14533,14533}
849
#elif 1 // 13/7 CRF
850
#define N1 4
851
#define SHIFT1 4
852
#define COEFFS1 (int[]){1,-9,-9,1}
853
#define N2 4
854
#define SHIFT2 4
855
#define COEFFS2 (int[]){-1,5,5,-1}
856
#define N3 0
857
#define SHIFT3 1
858
#define COEFFS3 NULL
859
#define N4 0
860
#define SHIFT4 1
861
#define COEFFS4 NULL
862
#elif 1 // 3/5
863
#define LX0 1
864
#define LX1 0
865
#define SCALEX 0.5
866
#define N1 2
867
#define SHIFT1 1
868
#define COEFFS1 (int[]){1,1}
869
#define N2 2
870
#define SHIFT2 2
871
#define COEFFS2 (int[]){-1,-1}
872
#define N3 0
873
#define SHIFT3 0
874
#define COEFFS3 NULL
875
#define N4 0
876
#define SHIFT4 0
877
#define COEFFS4 NULL
878
#elif 1 // 11/5
879
#define N1 0
880
#define SHIFT1 1
881
#define COEFFS1 NULL
882
#define N2 2
883
#define SHIFT2 2
884
#define COEFFS2 (int[]){-1,-1}
885
#define N3 2
886
#define SHIFT3 0
887
#define COEFFS3 (int[]){-1,-1}
888
#define N4 4
889
#define SHIFT4 7
890
#define COEFFS4 (int[]){-5,29,29,-5}
891
#define SCALEX 4
892
#elif 1 // 9/7 CDF
893
#define N1 2
894
#define SHIFT1 7
895
#define COEFFS1 (int[]){-203,-203}
896
#define N2 2
897
#define SHIFT2 12
898
#define COEFFS2 (int[]){-217,-217}
899
#define N3 2
900
#define SHIFT3 7
901
#define COEFFS3 (int[]){113,113}
902
#define N4 2
903
#define SHIFT4 9
904
#define COEFFS4 (int[]){227,227}
905
#define SCALEX 1
906
#elif 1 // 7/5 CDF
907
#define N1 0
908
#define SHIFT1 1
909
#define COEFFS1 NULL
910
#define N2 2
911
#define SHIFT2 2
912
#define COEFFS2 (int[]){-1,-1}
913
#define N3 2
914
#define SHIFT3 0
915
#define COEFFS3 (int[]){-1,-1}
916
#define N4 2
917
#define SHIFT4 4
918
#define COEFFS4 (int[]){3,3}
919
#elif 1 // 9/7 MN
920
#define N1 4
921
#define SHIFT1 4
922
#define COEFFS1 (int[]){1,-9,-9,1}
923
#define N2 2
924
#define SHIFT2 2
925
#define COEFFS2 (int[]){1,1}
926
#define N3 0
927
#define SHIFT3 1
928
#define COEFFS3 NULL
929
#define N4 0
930
#define SHIFT4 1
931
#define COEFFS4 NULL
932
#else // 13/7 CRF
933
#define N1 4
934
#define SHIFT1 4
935
#define COEFFS1 (int[]){1,-9,-9,1}
936
#define N2 4
937
#define SHIFT2 4
938
#define COEFFS2 (int[]){-1,5,5,-1}
939
#define N3 0
940
#define SHIFT3 1
941
#define COEFFS3 NULL
942
#define N4 0
943
#define SHIFT4 1
944
#define COEFFS4 NULL
945
#endif
946
static void horizontal_decomposeX(DWTELEM *b, int width){
947
    DWTELEM temp[width];
948
    const int width2= width>>1;
949
    const int w2= (width+1)>>1;
950
    int x;
951

    
952
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
953
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
954
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
955
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
956

    
957
    for(x=0; x<width2; x++){
958
        temp[x   ]= b[2*x    ];
959
        temp[x+w2]= b[2*x + 1];
960
    }
961
    if(width&1)
962
        temp[x   ]= b[2*x    ];
963
    memcpy(b, temp, width*sizeof(int));
964
}
965

    
966
static void horizontal_composeX(DWTELEM *b, int width){
967
    DWTELEM temp[width];
968
    const int width2= width>>1;
969
    int x;
970
    const int w2= (width+1)>>1;
971

    
972
    memcpy(temp, b, width*sizeof(int));
973
    for(x=0; x<width2; x++){
974
        b[2*x    ]= temp[x   ];
975
        b[2*x + 1]= temp[x+w2];
976
    }
977
    if(width&1)
978
        b[2*x    ]= temp[x   ];
979

    
980
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
981
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
982
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
983
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
984
}
985

    
986
static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
987
    int x, y;
988

    
989
    for(y=0; y<height; y++){
990
        for(x=0; x<width; x++){
991
            buffer[y*stride + x] *= SCALEX;
992
        }
993
    }
994

    
995
    for(y=0; y<height; y++){
996
        horizontal_decomposeX(buffer + y*stride, width);
997
    }
998

    
999
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
1000
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
1001
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
1002
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
1003
}
1004

    
1005
static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
1006
    int x, y;
1007

    
1008
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1009
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1010
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1011
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1012

    
1013
    for(y=0; y<height; y++){
1014
        horizontal_composeX(buffer + y*stride, width);
1015
    }
1016

    
1017
    for(y=0; y<height; y++){
1018
        for(x=0; x<width; x++){
1019
            buffer[y*stride + x] /= SCALEX;
1020
        }
1021
    }
1022
}
1023

    
1024
static void horizontal_decompose53i(DWTELEM *b, int width){
1025
    DWTELEM temp[width];
1026
    const int width2= width>>1;
1027
    int x;
1028
    const int w2= (width+1)>>1;
1029

    
1030
    for(x=0; x<width2; x++){
1031
        temp[x   ]= b[2*x    ];
1032
        temp[x+w2]= b[2*x + 1];
1033
    }
1034
    if(width&1)
1035
        temp[x   ]= b[2*x    ];
1036
#if 0
1037
    {
1038
    int A1,A2,A3,A4;
1039
    A2= temp[1       ];
1040
    A4= temp[0       ];
1041
    A1= temp[0+width2];
1042
    A1 -= (A2 + A4)>>1;
1043
    A4 += (A1 + 1)>>1;
1044
    b[0+width2] = A1;
1045
    b[0       ] = A4;
1046
    for(x=1; x+1<width2; x+=2){
1047
        A3= temp[x+width2];
1048
        A4= temp[x+1     ];
1049
        A3 -= (A2 + A4)>>1;
1050
        A2 += (A1 + A3 + 2)>>2;
1051
        b[x+width2] = A3;
1052
        b[x       ] = A2;
1053

1054
        A1= temp[x+1+width2];
1055
        A2= temp[x+2       ];
1056
        A1 -= (A2 + A4)>>1;
1057
        A4 += (A1 + A3 + 2)>>2;
1058
        b[x+1+width2] = A1;
1059
        b[x+1       ] = A4;
1060
    }
1061
    A3= temp[width-1];
1062
    A3 -= A2;
1063
    A2 += (A1 + A3 + 2)>>2;
1064
    b[width -1] = A3;
1065
    b[width2-1] = A2;
1066
    }
1067
#else
1068
    lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1069
    lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1070
#endif
1071
}
1072

    
1073
static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1074
    int i;
1075

    
1076
    for(i=0; i<width; i++){
1077
        b1[i] -= (b0[i] + b2[i])>>1;
1078
    }
1079
}
1080

    
1081
static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1082
    int i;
1083

    
1084
    for(i=0; i<width; i++){
1085
        b1[i] += (b0[i] + b2[i] + 2)>>2;
1086
    }
1087
}
1088

    
1089
static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1090
    int y;
1091
    DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1092
    DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1093

    
1094
    for(y=-2; y<height; y+=2){
1095
        DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1096
        DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1097

    
1098
{START_TIMER
1099
        if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1100
        if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1101
STOP_TIMER("horizontal_decompose53i")}
1102

    
1103
{START_TIMER
1104
        if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1105
        if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1106
STOP_TIMER("vertical_decompose53i*")}
1107

    
1108
        b0=b2;
1109
        b1=b3;
1110
    }
1111
}
1112

    
1113
static void horizontal_decompose97i(DWTELEM *b, int width){
1114
    DWTELEM temp[width];
1115
    const int w2= (width+1)>>1;
1116

    
1117
    lift (temp+w2, b    +1, b      , 1, 2, 2, width,  W_AM, W_AO, W_AS, 1, 1);
1118
    liftS(temp   , b      , temp+w2, 1, 2, 1, width,  W_BM, W_BO, W_BS, 0, 0);
1119
    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1120
    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1121
}
1122

    
1123

    
1124
static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1125
    int i;
1126

    
1127
    for(i=0; i<width; i++){
1128
        b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1129
    }
1130
}
1131

    
1132
static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1133
    int i;
1134

    
1135
    for(i=0; i<width; i++){
1136
#ifdef lift5
1137
        b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1138
#else
1139
        int r= 3*(b0[i] + b2[i]);
1140
        r+= r>>4;
1141
        r+= r>>8;
1142
        b1[i] += (r+W_CO)>>W_CS;
1143
#endif
1144
    }
1145
}
1146

    
1147
static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1148
    int i;
1149

    
1150
    for(i=0; i<width; i++){
1151
#ifdef liftS
1152
        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1153
#else
1154
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23);
1155
#endif
1156
    }
1157
}
1158

    
1159
static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1160
    int i;
1161

    
1162
    for(i=0; i<width; i++){
1163
        b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1164
    }
1165
}
1166

    
1167
static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1168
    int y;
1169
    DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1170
    DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1171
    DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1172
    DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1173

    
1174
    for(y=-4; y<height; y+=2){
1175
        DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1176
        DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1177

    
1178
{START_TIMER
1179
        if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1180
        if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1181
if(width>400){
1182
STOP_TIMER("horizontal_decompose97i")
1183
}}
1184

    
1185
{START_TIMER
1186
        if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1187
        if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1188
        if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1189
        if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1190

    
1191
if(width>400){
1192
STOP_TIMER("vertical_decompose97i")
1193
}}
1194

    
1195
        b0=b2;
1196
        b1=b3;
1197
        b2=b4;
1198
        b3=b5;
1199
    }
1200
}
1201

    
1202
void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1203
    int level;
1204

    
1205
    for(level=0; level<decomposition_count; level++){
1206
        switch(type){
1207
        case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1208
        case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1209
        case DWT_X: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1210
        }
1211
    }
1212
}
1213

    
1214
static void horizontal_compose53i(DWTELEM *b, int width){
1215
    DWTELEM temp[width];
1216
    const int width2= width>>1;
1217
    const int w2= (width+1)>>1;
1218
    int x;
1219

    
1220
#if 0
1221
    int A1,A2,A3,A4;
1222
    A2= temp[1       ];
1223
    A4= temp[0       ];
1224
    A1= temp[0+width2];
1225
    A1 -= (A2 + A4)>>1;
1226
    A4 += (A1 + 1)>>1;
1227
    b[0+width2] = A1;
1228
    b[0       ] = A4;
1229
    for(x=1; x+1<width2; x+=2){
1230
        A3= temp[x+width2];
1231
        A4= temp[x+1     ];
1232
        A3 -= (A2 + A4)>>1;
1233
        A2 += (A1 + A3 + 2)>>2;
1234
        b[x+width2] = A3;
1235
        b[x       ] = A2;
1236

1237
        A1= temp[x+1+width2];
1238
        A2= temp[x+2       ];
1239
        A1 -= (A2 + A4)>>1;
1240
        A4 += (A1 + A3 + 2)>>2;
1241
        b[x+1+width2] = A1;
1242
        b[x+1       ] = A4;
1243
    }
1244
    A3= temp[width-1];
1245
    A3 -= A2;
1246
    A2 += (A1 + A3 + 2)>>2;
1247
    b[width -1] = A3;
1248
    b[width2-1] = A2;
1249
#else
1250
    lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1251
    lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1252
#endif
1253
    for(x=0; x<width2; x++){
1254
        b[2*x    ]= temp[x   ];
1255
        b[2*x + 1]= temp[x+w2];
1256
    }
1257
    if(width&1)
1258
        b[2*x    ]= temp[x   ];
1259
}
1260

    
1261
static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1262
    int i;
1263

    
1264
    for(i=0; i<width; i++){
1265
        b1[i] += (b0[i] + b2[i])>>1;
1266
    }
1267
}
1268

    
1269
static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1270
    int i;
1271

    
1272
    for(i=0; i<width; i++){
1273
        b1[i] -= (b0[i] + b2[i] + 2)>>2;
1274
    }
1275
}
1276

    
1277
static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1278
    cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1279
    cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1280
    cs->y = -1;
1281
}
1282

    
1283
static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1284
    cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1285
    cs->b1 = buffer + mirror(-1  , height-1)*stride;
1286
    cs->y = -1;
1287
}
1288

    
1289
static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1290
    int y= cs->y;
1291

    
1292
    DWTELEM *b0= cs->b0;
1293
    DWTELEM *b1= cs->b1;
1294
    DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1295
    DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1296

    
1297
{START_TIMER
1298
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1299
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1300
STOP_TIMER("vertical_compose53i*")}
1301

    
1302
{START_TIMER
1303
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1304
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1305
STOP_TIMER("horizontal_compose53i")}
1306

    
1307
    cs->b0 = b2;
1308
    cs->b1 = b3;
1309
    cs->y += 2;
1310
}
1311

    
1312
static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1313
    int y= cs->y;
1314
    DWTELEM *b0= cs->b0;
1315
    DWTELEM *b1= cs->b1;
1316
    DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1317
    DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1318

    
1319
{START_TIMER
1320
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1321
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1322
STOP_TIMER("vertical_compose53i*")}
1323

    
1324
{START_TIMER
1325
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1326
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1327
STOP_TIMER("horizontal_compose53i")}
1328

    
1329
    cs->b0 = b2;
1330
    cs->b1 = b3;
1331
    cs->y += 2;
1332
}
1333

    
1334
static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1335
    dwt_compose_t cs;
1336
    spatial_compose53i_init(&cs, buffer, height, stride);
1337
    while(cs.y <= height)
1338
        spatial_compose53i_dy(&cs, buffer, width, height, stride);
1339
}
1340

    
1341

    
1342
void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1343
    DWTELEM temp[width];
1344
    const int w2= (width+1)>>1;
1345

    
1346
    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1347
    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1348
    liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO, W_BS, 0, 1);
1349
    lift (b+1    , temp+w2, b      , 2, 1, 2, width,  W_AM, W_AO, W_AS, 1, 0);
1350
}
1351

    
1352
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1353
    int i;
1354

    
1355
    for(i=0; i<width; i++){
1356
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1357
    }
1358
}
1359

    
1360
static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1361
    int i;
1362

    
1363
    for(i=0; i<width; i++){
1364
#ifdef lift5
1365
        b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1366
#else
1367
        int r= 3*(b0[i] + b2[i]);
1368
        r+= r>>4;
1369
        r+= r>>8;
1370
        b1[i] -= (r+W_CO)>>W_CS;
1371
#endif
1372
    }
1373
}
1374

    
1375
static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1376
    int i;
1377

    
1378
    for(i=0; i<width; i++){
1379
#ifdef liftS
1380
        b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1381
#else
1382
        b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1383
#endif
1384
    }
1385
}
1386

    
1387
static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1388
    int i;
1389

    
1390
    for(i=0; i<width; i++){
1391
        b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1392
    }
1393
}
1394

    
1395
void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1396
    int i;
1397

    
1398
    for(i=0; i<width; i++){
1399
#ifndef lift5
1400
        int r;
1401
#endif
1402
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1403
#ifdef lift5
1404
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1405
#else
1406
        r= 3*(b2[i] + b4[i]);
1407
        r+= r>>4;
1408
        r+= r>>8;
1409
        b3[i] -= (r+W_CO)>>W_CS;
1410
#endif
1411
#ifdef liftS
1412
        b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1413
#else
1414
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1415
#endif
1416
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1417
    }
1418
}
1419

    
1420
static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1421
    cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1422
    cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1423
    cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1424
    cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1425
    cs->y = -3;
1426
}
1427

    
1428
static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1429
    cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1430
    cs->b1 = buffer + mirror(-3  , height-1)*stride;
1431
    cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1432
    cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1433
    cs->y = -3;
1434
}
1435

    
1436
static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1437
    int y = cs->y;
1438

    
1439
    DWTELEM *b0= cs->b0;
1440
    DWTELEM *b1= cs->b1;
1441
    DWTELEM *b2= cs->b2;
1442
    DWTELEM *b3= cs->b3;
1443
    DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1444
    DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1445

    
1446
{START_TIMER
1447
    if(y>0 && y+4<height){
1448
        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1449
    }else{
1450
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1451
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1452
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1453
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1454
    }
1455
if(width>400){
1456
STOP_TIMER("vertical_compose97i")}}
1457

    
1458
{START_TIMER
1459
        if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1460
        if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1461
if(width>400 && y+0<(unsigned)height){
1462
STOP_TIMER("horizontal_compose97i")}}
1463

    
1464
    cs->b0=b2;
1465
    cs->b1=b3;
1466
    cs->b2=b4;
1467
    cs->b3=b5;
1468
    cs->y += 2;
1469
}
1470

    
1471
static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1472
    int y = cs->y;
1473
    DWTELEM *b0= cs->b0;
1474
    DWTELEM *b1= cs->b1;
1475
    DWTELEM *b2= cs->b2;
1476
    DWTELEM *b3= cs->b3;
1477
    DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1478
    DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1479

    
1480
{START_TIMER
1481
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1482
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1483
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1484
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1485
if(width>400){
1486
STOP_TIMER("vertical_compose97i")}}
1487

    
1488
{START_TIMER
1489
        if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1490
        if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1491
if(width>400 && b0 <= b2){
1492
STOP_TIMER("horizontal_compose97i")}}
1493

    
1494
    cs->b0=b2;
1495
    cs->b1=b3;
1496
    cs->b2=b4;
1497
    cs->b3=b5;
1498
    cs->y += 2;
1499
}
1500

    
1501
static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1502
    dwt_compose_t cs;
1503
    spatial_compose97i_init(&cs, buffer, height, stride);
1504
    while(cs.y <= height)
1505
        spatial_compose97i_dy(&cs, buffer, width, height, stride);
1506
}
1507

    
1508
static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1509
    int level;
1510
    for(level=decomposition_count-1; level>=0; level--){
1511
        switch(type){
1512
        case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1513
        case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1514
        /* not slicified yet */
1515
        case DWT_X: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1516
          av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1517
        }
1518
    }
1519
}
1520

    
1521
static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1522
    int level;
1523
    for(level=decomposition_count-1; level>=0; level--){
1524
        switch(type){
1525
        case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1526
        case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1527
        /* not slicified yet */
1528
        case DWT_X: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1529
        }
1530
    }
1531
}
1532

    
1533
static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1534
    const int support = type==1 ? 3 : 5;
1535
    int level;
1536
    if(type==2) return;
1537

    
1538
    for(level=decomposition_count-1; level>=0; level--){
1539
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1540
            switch(type){
1541
            case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1542
                    break;
1543
            case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1544
                    break;
1545
            case DWT_X: break;
1546
            }
1547
        }
1548
    }
1549
}
1550

    
1551
static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1552
    const int support = type==1 ? 3 : 5;
1553
    int level;
1554
    if(type==2) return;
1555

    
1556
    for(level=decomposition_count-1; level>=0; level--){
1557
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1558
            switch(type){
1559
            case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1560
                    break;
1561
            case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1562
                    break;
1563
            case DWT_X: break;
1564
            }
1565
        }
1566
    }
1567
}
1568

    
1569
static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1570
    if(type==2){
1571
        int level;
1572
        for(level=decomposition_count-1; level>=0; level--)
1573
            spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1574
    }else{
1575
        dwt_compose_t cs[MAX_DECOMPOSITIONS];
1576
        int y;
1577
        ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1578
        for(y=0; y<height; y+=4)
1579
            ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1580
    }
1581
}
1582

    
1583
static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1584
    const int w= b->width;
1585
    const int h= b->height;
1586
    int x, y;
1587

    
1588
    if(1){
1589
        int run=0;
1590
        int runs[w*h];
1591
        int run_index=0;
1592
        int max_index;
1593

    
1594
        for(y=0; y<h; y++){
1595
            for(x=0; x<w; x++){
1596
                int v, p=0;
1597
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1598
                v= src[x + y*stride];
1599

    
1600
                if(y){
1601
                    t= src[x + (y-1)*stride];
1602
                    if(x){
1603
                        lt= src[x - 1 + (y-1)*stride];
1604
                    }
1605
                    if(x + 1 < w){
1606
                        rt= src[x + 1 + (y-1)*stride];
1607
                    }
1608
                }
1609
                if(x){
1610
                    l= src[x - 1 + y*stride];
1611
                    /*if(x > 1){
1612
                        if(orientation==1) ll= src[y + (x-2)*stride];
1613
                        else               ll= src[x - 2 + y*stride];
1614
                    }*/
1615
                }
1616
                if(parent){
1617
                    int px= x>>1;
1618
                    int py= y>>1;
1619
                    if(px<b->parent->width && py<b->parent->height)
1620
                        p= parent[px + py*2*stride];
1621
                }
1622
                if(!(/*ll|*/l|lt|t|rt|p)){
1623
                    if(v){
1624
                        runs[run_index++]= run;
1625
                        run=0;
1626
                    }else{
1627
                        run++;
1628
                    }
1629
                }
1630
            }
1631
        }
1632
        max_index= run_index;
1633
        runs[run_index++]= run;
1634
        run_index=0;
1635
        run= runs[run_index++];
1636

    
1637
        put_symbol2(&s->c, b->state[30], max_index, 0);
1638
        if(run_index <= max_index)
1639
            put_symbol2(&s->c, b->state[1], run, 3);
1640

    
1641
        for(y=0; y<h; y++){
1642
            if(s->c.bytestream_end - s->c.bytestream < w*40){
1643
                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1644
                return -1;
1645
            }
1646
            for(x=0; x<w; x++){
1647
                int v, p=0;
1648
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1649
                v= src[x + y*stride];
1650

    
1651
                if(y){
1652
                    t= src[x + (y-1)*stride];
1653
                    if(x){
1654
                        lt= src[x - 1 + (y-1)*stride];
1655
                    }
1656
                    if(x + 1 < w){
1657
                        rt= src[x + 1 + (y-1)*stride];
1658
                    }
1659
                }
1660
                if(x){
1661
                    l= src[x - 1 + y*stride];
1662
                    /*if(x > 1){
1663
                        if(orientation==1) ll= src[y + (x-2)*stride];
1664
                        else               ll= src[x - 2 + y*stride];
1665
                    }*/
1666
                }
1667
                if(parent){
1668
                    int px= x>>1;
1669
                    int py= y>>1;
1670
                    if(px<b->parent->width && py<b->parent->height)
1671
                        p= parent[px + py*2*stride];
1672
                }
1673
                if(/*ll|*/l|lt|t|rt|p){
1674
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1675

    
1676
                    put_rac(&s->c, &b->state[0][context], !!v);
1677
                }else{
1678
                    if(!run){
1679
                        run= runs[run_index++];
1680

    
1681
                        if(run_index <= max_index)
1682
                            put_symbol2(&s->c, b->state[1], run, 3);
1683
                        assert(v);
1684
                    }else{
1685
                        run--;
1686
                        assert(!v);
1687
                    }
1688
                }
1689
                if(v){
1690
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1691
                    int l2= 2*FFABS(l) + (l<0);
1692
                    int t2= 2*FFABS(t) + (t<0);
1693

    
1694
                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1695
                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1696
                }
1697
            }
1698
        }
1699
    }
1700
    return 0;
1701
}
1702

    
1703
static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1704
//    encode_subband_qtree(s, b, src, parent, stride, orientation);
1705
//    encode_subband_z0run(s, b, src, parent, stride, orientation);
1706
    return encode_subband_c0run(s, b, src, parent, stride, orientation);
1707
//    encode_subband_dzr(s, b, src, parent, stride, orientation);
1708
}
1709

    
1710
static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1711
    const int w= b->width;
1712
    const int h= b->height;
1713
    int x,y;
1714

    
1715
    if(1){
1716
        int run, runs;
1717
        x_and_coeff *xc= b->x_coeff;
1718
        x_and_coeff *prev_xc= NULL;
1719
        x_and_coeff *prev2_xc= xc;
1720
        x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1721
        x_and_coeff *prev_parent_xc= parent_xc;
1722

    
1723
        runs= get_symbol2(&s->c, b->state[30], 0);
1724
        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1725
        else           run= INT_MAX;
1726

    
1727
        for(y=0; y<h; y++){
1728
            int v=0;
1729
            int lt=0, t=0, rt=0;
1730

    
1731
            if(y && prev_xc->x == 0){
1732
                rt= prev_xc->coeff;
1733
            }
1734
            for(x=0; x<w; x++){
1735
                int p=0;
1736
                const int l= v;
1737

    
1738
                lt= t; t= rt;
1739

    
1740
                if(y){
1741
                    if(prev_xc->x <= x)
1742
                        prev_xc++;
1743
                    if(prev_xc->x == x + 1)
1744
                        rt= prev_xc->coeff;
1745
                    else
1746
                        rt=0;
1747
                }
1748
                if(parent_xc){
1749
                    if(x>>1 > parent_xc->x){
1750
                        parent_xc++;
1751
                    }
1752
                    if(x>>1 == parent_xc->x){
1753
                        p= parent_xc->coeff;
1754
                    }
1755
                }
1756
                if(/*ll|*/l|lt|t|rt|p){
1757
                    int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1758

    
1759
                    v=get_rac(&s->c, &b->state[0][context]);
1760
                    if(v){
1761
                        v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1762
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1763

    
1764
                        xc->x=x;
1765
                        (xc++)->coeff= v;
1766
                    }
1767
                }else{
1768
                    if(!run){
1769
                        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1770
                        else           run= INT_MAX;
1771
                        v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1772
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1773

    
1774
                        xc->x=x;
1775
                        (xc++)->coeff= v;
1776
                    }else{
1777
                        int max_run;
1778
                        run--;
1779
                        v=0;
1780

    
1781
                        if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1782
                        else  max_run= FFMIN(run, w-x-1);
1783
                        if(parent_xc)
1784
                            max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1785
                        x+= max_run;
1786
                        run-= max_run;
1787
                    }
1788
                }
1789
            }
1790
            (xc++)->x= w+1; //end marker
1791
            prev_xc= prev2_xc;
1792
            prev2_xc= xc;
1793

    
1794
            if(parent_xc){
1795
                if(y&1){
1796
                    while(parent_xc->x != parent->width+1)
1797
                        parent_xc++;
1798
                    parent_xc++;
1799
                    prev_parent_xc= parent_xc;
1800
                }else{
1801
                    parent_xc= prev_parent_xc;
1802
                }
1803
            }
1804
        }
1805

    
1806
        (xc++)->x= w+1; //end marker
1807
    }
1808
}
1809

    
1810
static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1811
    const int w= b->width;
1812
    int y;
1813
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1814
    int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1815
    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1816
    int new_index = 0;
1817

    
1818
    START_TIMER
1819

    
1820
    if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1821
        qadd= 0;
1822
        qmul= 1<<QEXPSHIFT;
1823
    }
1824

    
1825
    /* If we are on the second or later slice, restore our index. */
1826
    if (start_y != 0)
1827
        new_index = save_state[0];
1828

    
1829

    
1830
    for(y=start_y; y<h; y++){
1831
        int x = 0;
1832
        int v;
1833
        DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1834
        memset(line, 0, b->width*sizeof(DWTELEM));
1835
        v = b->x_coeff[new_index].coeff;
1836
        x = b->x_coeff[new_index++].x;
1837
        while(x < w)
1838
        {
1839
            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1840
            register int u= -(v&1);
1841
            line[x] = (t^u) - u;
1842

    
1843
            v = b->x_coeff[new_index].coeff;
1844
            x = b->x_coeff[new_index++].x;
1845
        }
1846
    }
1847
    if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1848
        STOP_TIMER("decode_subband")
1849
    }
1850

    
1851
    /* Save our variables for the next slice. */
1852
    save_state[0] = new_index;
1853

    
1854
    return;
1855
}
1856

    
1857
static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1858
    int plane_index, level, orientation;
1859

    
1860
    for(plane_index=0; plane_index<3; plane_index++){
1861
        for(level=0; level<s->spatial_decomposition_count; level++){
1862
            for(orientation=level ? 1:0; orientation<4; orientation++){
1863
                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1864
            }
1865
        }
1866
    }
1867
    memset(s->header_state, MID_STATE, sizeof(s->header_state));
1868
    memset(s->block_state, MID_STATE, sizeof(s->block_state));
1869
}
1870

    
1871
static int alloc_blocks(SnowContext *s){
1872
    int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1873
    int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1874

    
1875
    s->b_width = w;
1876
    s->b_height= h;
1877

    
1878
    s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1879
    return 0;
1880
}
1881

    
1882
static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1883
    uint8_t *bytestream= d->bytestream;
1884
    uint8_t *bytestream_start= d->bytestream_start;
1885
    *d= *s;
1886
    d->bytestream= bytestream;
1887
    d->bytestream_start= bytestream_start;
1888
}
1889

    
1890
//near copy & paste from dsputil, FIXME
1891
static int pix_sum(uint8_t * pix, int line_size, int w)
1892
{
1893
    int s, i, j;
1894

    
1895
    s = 0;
1896
    for (i = 0; i < w; i++) {
1897
        for (j = 0; j < w; j++) {
1898
            s += pix[0];
1899
            pix ++;
1900
        }
1901
        pix += line_size - w;
1902
    }
1903
    return s;
1904
}
1905

    
1906
//near copy & paste from dsputil, FIXME
1907
static int pix_norm1(uint8_t * pix, int line_size, int w)
1908
{
1909
    int s, i, j;
1910
    uint32_t *sq = ff_squareTbl + 256;
1911

    
1912
    s = 0;
1913
    for (i = 0; i < w; i++) {
1914
        for (j = 0; j < w; j ++) {
1915
            s += sq[pix[0]];
1916
            pix ++;
1917
        }
1918
        pix += line_size - w;
1919
    }
1920
    return s;
1921
}
1922

    
1923
static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1924
    const int w= s->b_width << s->block_max_depth;
1925
    const int rem_depth= s->block_max_depth - level;
1926
    const int index= (x + y*w) << rem_depth;
1927
    const int block_w= 1<<rem_depth;
1928
    BlockNode block;
1929
    int i,j;
1930

    
1931
    block.color[0]= l;
1932
    block.color[1]= cb;
1933
    block.color[2]= cr;
1934
    block.mx= mx;
1935
    block.my= my;
1936
    block.ref= ref;
1937
    block.type= type;
1938
    block.level= level;
1939

    
1940
    for(j=0; j<block_w; j++){
1941
        for(i=0; i<block_w; i++){
1942
            s->block[index + i + j*w]= block;
1943
        }
1944
    }
1945
}
1946

    
1947
static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1948
    const int offset[3]= {
1949
          y*c->  stride + x,
1950
        ((y*c->uvstride + x)>>1),
1951
        ((y*c->uvstride + x)>>1),
1952
    };
1953
    int i;
1954
    for(i=0; i<3; i++){
1955
        c->src[0][i]= src [i];
1956
        c->ref[0][i]= ref [i] + offset[i];
1957
    }
1958
    assert(!ref_index);
1959
}
1960

    
1961
static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1962
                           const BlockNode *left, const BlockNode *top, const BlockNode *tr){
1963
    if(s->ref_frames == 1){
1964
        *mx = mid_pred(left->mx, top->mx, tr->mx);
1965
        *my = mid_pred(left->my, top->my, tr->my);
1966
    }else{
1967
        const int *scale = scale_mv_ref[ref];
1968
        *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
1969
                       (top ->mx * scale[top ->ref] + 128) >>8,
1970
                       (tr  ->mx * scale[tr  ->ref] + 128) >>8);
1971
        *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
1972
                       (top ->my * scale[top ->ref] + 128) >>8,
1973
                       (tr  ->my * scale[tr  ->ref] + 128) >>8);
1974
    }
1975
}
1976

    
1977
//FIXME copy&paste
1978
#define P_LEFT P[1]
1979
#define P_TOP P[2]
1980
#define P_TOPRIGHT P[3]
1981
#define P_MEDIAN P[4]
1982
#define P_MV1 P[9]
1983
#define FLAG_QPEL   1 //must be 1
1984

    
1985
static int encode_q_branch(SnowContext *s, int level, int x, int y){
1986
    uint8_t p_buffer[1024];
1987
    uint8_t i_buffer[1024];
1988
    uint8_t p_state[sizeof(s->block_state)];
1989
    uint8_t i_state[sizeof(s->block_state)];
1990
    RangeCoder pc, ic;
1991
    uint8_t *pbbak= s->c.bytestream;
1992
    uint8_t *pbbak_start= s->c.bytestream_start;
1993
    int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
1994
    const int w= s->b_width  << s->block_max_depth;
1995
    const int h= s->b_height << s->block_max_depth;
1996
    const int rem_depth= s->block_max_depth - level;
1997
    const int index= (x + y*w) << rem_depth;
1998
    const int block_w= 1<<(LOG2_MB_SIZE - level);
1999
    int trx= (x+1)<<rem_depth;
2000
    int try= (y+1)<<rem_depth;
2001
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2002
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2003
    const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
2004
    const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2005
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2006
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2007
    int pl = left->color[0];
2008
    int pcb= left->color[1];
2009
    int pcr= left->color[2];
2010
    int pmx, pmy;
2011
    int mx=0, my=0;
2012
    int l,cr,cb;
2013
    const int stride= s->current_picture.linesize[0];
2014
    const int uvstride= s->current_picture.linesize[1];
2015
    uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2016
                                s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2017
                                s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2018
    int P[10][2];
2019
    int16_t last_mv[3][2];
2020
    int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2021
    const int shift= 1+qpel;
2022
    MotionEstContext *c= &s->m.me;
2023
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2024
    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2025
    int my_context= av_log2(2*FFABS(left->my - top->my));
2026
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2027
    int ref, best_ref, ref_score, ref_mx, ref_my;
2028

    
2029
    assert(sizeof(s->block_state) >= 256);
2030
    if(s->keyframe){
2031
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2032
        return 0;
2033
    }
2034

    
2035
//    clip predictors / edge ?
2036

    
2037
    P_LEFT[0]= left->mx;
2038
    P_LEFT[1]= left->my;
2039
    P_TOP [0]= top->mx;
2040
    P_TOP [1]= top->my;
2041
    P_TOPRIGHT[0]= tr->mx;
2042
    P_TOPRIGHT[1]= tr->my;
2043

    
2044
    last_mv[0][0]= s->block[index].mx;
2045
    last_mv[0][1]= s->block[index].my;
2046
    last_mv[1][0]= right->mx;
2047
    last_mv[1][1]= right->my;
2048
    last_mv[2][0]= bottom->mx;
2049
    last_mv[2][1]= bottom->my;
2050

    
2051
    s->m.mb_stride=2;
2052
    s->m.mb_x=
2053
    s->m.mb_y= 0;
2054
    c->skip= 0;
2055

    
2056
    assert(c->  stride ==   stride);
2057
    assert(c->uvstride == uvstride);
2058

    
2059
    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2060
    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2061
    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2062
    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2063

    
2064
    c->xmin = - x*block_w - 16+2;
2065
    c->ymin = - y*block_w - 16+2;
2066
    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2067
    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2068

    
2069
    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2070
    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2071
    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2072
    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2073
    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2074
    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2075
    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2076

    
2077
    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2078
    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2079

    
2080
    if (!y) {
2081
        c->pred_x= P_LEFT[0];
2082
        c->pred_y= P_LEFT[1];
2083
    } else {
2084
        c->pred_x = P_MEDIAN[0];
2085
        c->pred_y = P_MEDIAN[1];
2086
    }
2087

    
2088
    score= INT_MAX;
2089
    best_ref= 0;
2090
    for(ref=0; ref<s->ref_frames; ref++){
2091
        init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2092

    
2093
        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2094
                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2095

    
2096
        assert(ref_mx >= c->xmin);
2097
        assert(ref_mx <= c->xmax);
2098
        assert(ref_my >= c->ymin);
2099
        assert(ref_my <= c->ymax);
2100

    
2101
        ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2102
        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2103
        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2104
        if(s->ref_mvs[ref]){
2105
            s->ref_mvs[ref][index][0]= ref_mx;
2106
            s->ref_mvs[ref][index][1]= ref_my;
2107
            s->ref_scores[ref][index]= ref_score;
2108
        }
2109
        if(score > ref_score){
2110
            score= ref_score;
2111
            best_ref= ref;
2112
            mx= ref_mx;
2113
            my= ref_my;
2114
        }
2115
    }
2116
    //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
2117

    
2118
  //  subpel search
2119
    base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
2120
    pc= s->c;
2121
    pc.bytestream_start=
2122
    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2123
    memcpy(p_state, s->block_state, sizeof(s->block_state));
2124

    
2125
    if(level!=s->block_max_depth)
2126
        put_rac(&pc, &p_state[4 + s_context], 1);
2127
    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2128
    if(s->ref_frames > 1)
2129
        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2130
    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
2131
    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2132
    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2133
    p_len= pc.bytestream - pc.bytestream_start;
2134
    score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
2135

    
2136
    block_s= block_w*block_w;
2137
    sum = pix_sum(current_data[0], stride, block_w);
2138
    l= (sum + block_s/2)/block_s;
2139
    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2140

    
2141
    block_s= block_w*block_w>>2;
2142
    sum = pix_sum(current_data[1], uvstride, block_w>>1);
2143
    cb= (sum + block_s/2)/block_s;
2144
//    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2145
    sum = pix_sum(current_data[2], uvstride, block_w>>1);
2146
    cr= (sum + block_s/2)/block_s;
2147
//    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2148

    
2149
    ic= s->c;
2150
    ic.bytestream_start=
2151
    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2152
    memcpy(i_state, s->block_state, sizeof(s->block_state));
2153
    if(level!=s->block_max_depth)
2154
        put_rac(&ic, &i_state[4 + s_context], 1);
2155
    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2156
    put_symbol(&ic, &i_state[32],  l-pl , 1);
2157
    put_symbol(&ic, &i_state[64], cb-pcb, 1);
2158
    put_symbol(&ic, &i_state[96], cr-pcr, 1);
2159
    i_len= ic.bytestream - ic.bytestream_start;
2160
    iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
2161

    
2162
//    assert(score==256*256*256*64-1);
2163
    assert(iscore < 255*255*256 + s->lambda2*10);
2164
    assert(iscore >= 0);
2165
    assert(l>=0 && l<=255);
2166
    assert(pl>=0 && pl<=255);
2167

    
2168
    if(level==0){
2169
        int varc= iscore >> 8;
2170
        int vard= score >> 8;
2171
        if (vard <= 64 || vard < varc)
2172
            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2173
        else
2174
            c->scene_change_score+= s->m.qscale;
2175
    }
2176

    
2177
    if(level!=s->block_max_depth){
2178
        put_rac(&s->c, &s->block_state[4 + s_context], 0);
2179
        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2180
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2181
        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2182
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2183
        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2184

    
2185
        if(score2 < score && score2 < iscore)
2186
            return score2;
2187
    }
2188

    
2189
    if(iscore < score){
2190
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2191
        memcpy(pbbak, i_buffer, i_len);
2192
        s->c= ic;
2193
        s->c.bytestream_start= pbbak_start;
2194
        s->c.bytestream= pbbak + i_len;
2195
        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2196
        memcpy(s->block_state, i_state, sizeof(s->block_state));
2197
        return iscore;
2198
    }else{
2199
        memcpy(pbbak, p_buffer, p_len);
2200
        s->c= pc;
2201
        s->c.bytestream_start= pbbak_start;
2202
        s->c.bytestream= pbbak + p_len;
2203
        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2204
        memcpy(s->block_state, p_state, sizeof(s->block_state));
2205
        return score;
2206
    }
2207
}
2208

    
2209
static av_always_inline int same_block(BlockNode *a, BlockNode *b){
2210
    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2211
        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2212
    }else{
2213
        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2214
    }
2215
}
2216

    
2217
static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2218
    const int w= s->b_width  << s->block_max_depth;
2219
    const int rem_depth= s->block_max_depth - level;
2220
    const int index= (x + y*w) << rem_depth;
2221
    int trx= (x+1)<<rem_depth;
2222
    BlockNode *b= &s->block[index];
2223
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2224
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2225
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2226
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2227
    int pl = left->color[0];
2228
    int pcb= left->color[1];
2229
    int pcr= left->color[2];
2230
    int pmx, pmy;
2231
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2232
    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2233
    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2234
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2235

    
2236
    if(s->keyframe){
2237
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2238
        return;
2239
    }
2240

    
2241
    if(level!=s->block_max_depth){
2242
        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2243
            put_rac(&s->c, &s->block_state[4 + s_context], 1);
2244
        }else{
2245
            put_rac(&s->c, &s->block_state[4 + s_context], 0);
2246
            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2247
            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2248
            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2249
            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2250
            return;
2251
        }
2252
    }
2253
    if(b->type & BLOCK_INTRA){
2254
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2255
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2256
        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2257
        put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2258
        put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2259
        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2260
    }else{
2261
        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2262
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2263
        if(s->ref_frames > 1)
2264
            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2265
        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2266
        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2267
        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2268
    }
2269
}
2270

    
2271
static void decode_q_branch(SnowContext *s, int level, int x, int y){
2272
    const int w= s->b_width << s->block_max_depth;
2273
    const int rem_depth= s->block_max_depth - level;
2274
    const int index= (x + y*w) << rem_depth;
2275
    int trx= (x+1)<<rem_depth;
2276
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2277
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2278
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2279
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2280
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2281

    
2282
    if(s->keyframe){
2283
        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2284
        return;
2285
    }
2286

    
2287
    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2288
        int type, mx, my;
2289
        int l = left->color[0];
2290
        int cb= left->color[1];
2291
        int cr= left->color[2];
2292
        int ref = 0;
2293
        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2294
        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2295
        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2296

    
2297
        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2298

    
2299
        if(type){
2300
            pred_mv(s, &mx, &my, 0, left, top, tr);
2301
            l += get_symbol(&s->c, &s->block_state[32], 1);
2302
            cb+= get_symbol(&s->c, &s->block_state[64], 1);
2303
            cr+= get_symbol(&s->c, &s->block_state[96], 1);
2304
        }else{
2305
            if(s->ref_frames > 1)
2306
                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2307
            pred_mv(s, &mx, &my, ref, left, top, tr);
2308
            mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2309
            my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2310
        }
2311
        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2312
    }else{
2313
        decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2314
        decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2315
        decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2316
        decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2317
    }
2318
}
2319

    
2320
static void encode_blocks(SnowContext *s, int search){
2321
    int x, y;
2322
    int w= s->b_width;
2323
    int h= s->b_height;
2324

    
2325
    if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2326
        iterative_me(s);
2327

    
2328
    for(y=0; y<h; y++){
2329
        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2330
            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2331
            return;
2332
        }
2333
        for(x=0; x<w; x++){
2334
            if(s->avctx->me_method == ME_ITER || !search)
2335
                encode_q_branch2(s, 0, x, y);
2336
            else
2337
                encode_q_branch (s, 0, x, y);
2338
        }
2339
    }
2340
}
2341

    
2342
static void decode_blocks(SnowContext *s){
2343
    int x, y;
2344
    int w= s->b_width;
2345
    int h= s->b_height;
2346

    
2347
    for(y=0; y<h; y++){
2348
        for(x=0; x<w; x++){
2349
            decode_q_branch(s, 0, x, y);
2350
        }
2351
    }
2352
}
2353

    
2354
static void mc_block(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2355
    int x, y;
2356
START_TIMER
2357
    for(y=0; y < b_h+5; y++){
2358
        for(x=0; x < b_w; x++){
2359
            int a0= src[x    ];
2360
            int a1= src[x + 1];
2361
            int a2= src[x + 2];
2362
            int a3= src[x + 3];
2363
            int a4= src[x + 4];
2364
            int a5= src[x + 5];
2365
//            int am= 9*(a1+a2) - (a0+a3);
2366
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2367
//            int am= 18*(a2+a3) - 2*(a1+a4);
2368
//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2369
//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2370

    
2371
//            if(b_w==16) am= 8*(a1+a2);
2372

    
2373
            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2374
            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2375

    
2376
            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2377
            if(am&(~255)) am= ~(am>>31);
2378

    
2379
            tmp[x] = am;
2380

    
2381
/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2382
            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2383
            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2384
            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2385
        }
2386
        tmp += stride;
2387
        src += stride;
2388
    }
2389
    tmp -= (b_h+5)*stride;
2390

    
2391
    for(y=0; y < b_h; y++){
2392
        for(x=0; x < b_w; x++){
2393
            int a0= tmp[x + 0*stride];
2394
            int a1= tmp[x + 1*stride];
2395
            int a2= tmp[x + 2*stride];
2396
            int a3= tmp[x + 3*stride];
2397
            int a4= tmp[x + 4*stride];
2398
            int a5= tmp[x + 5*stride];
2399
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2400
//            int am= 18*(a2+a3) - 2*(a1+a4);
2401
/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2402
            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2403

    
2404
//            if(b_w==16) am= 8*(a1+a2);
2405

    
2406
            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2407
            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2408

    
2409
            if(am&(~255)) am= ~(am>>31);
2410

    
2411
            dst[x] = am;
2412
/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2413
            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2414
            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2415
            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2416
        }
2417
        dst += stride;
2418
        tmp += stride;
2419
    }
2420
STOP_TIMER("mc_block")
2421
}
2422

    
2423
#define mca(dx,dy,b_w)\
2424
static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, int stride, int h){\
2425
    uint8_t tmp[stride*(b_w+5)];\
2426
    assert(h==b_w);\
2427
    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2428
}
2429

    
2430
mca( 0, 0,16)
2431
mca( 8, 0,16)
2432
mca( 0, 8,16)
2433
mca( 8, 8,16)
2434
mca( 0, 0,8)
2435
mca( 8, 0,8)
2436
mca( 0, 8,8)
2437
mca( 8, 8,8)
2438

    
2439
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2440
    if(block->type & BLOCK_INTRA){
2441
        int x, y;
2442
        const int color = block->color[plane_index];
2443
        const int color4= color*0x01010101;
2444
        if(b_w==32){
2445
            for(y=0; y < b_h; y++){
2446
                *(uint32_t*)&dst[0 + y*stride]= color4;
2447
                *(uint32_t*)&dst[4 + y*stride]= color4;
2448
                *(uint32_t*)&dst[8 + y*stride]= color4;
2449
                *(uint32_t*)&dst[12+ y*stride]= color4;
2450
                *(uint32_t*)&dst[16+ y*stride]= color4;
2451
                *(uint32_t*)&dst[20+ y*stride]= color4;
2452
                *(uint32_t*)&dst[24+ y*stride]= color4;
2453
                *(uint32_t*)&dst[28+ y*stride]= color4;
2454
            }
2455
        }else if(b_w==16){
2456
            for(y=0; y < b_h; y++){
2457
                *(uint32_t*)&dst[0 + y*stride]= color4;
2458
                *(uint32_t*)&dst[4 + y*stride]= color4;
2459
                *(uint32_t*)&dst[8 + y*stride]= color4;
2460
                *(uint32_t*)&dst[12+ y*stride]= color4;
2461
            }
2462
        }else if(b_w==8){
2463
            for(y=0; y < b_h; y++){
2464
                *(uint32_t*)&dst[0 + y*stride]= color4;
2465
                *(uint32_t*)&dst[4 + y*stride]= color4;
2466
            }
2467
        }else if(b_w==4){
2468
            for(y=0; y < b_h; y++){
2469
                *(uint32_t*)&dst[0 + y*stride]= color4;
2470
            }
2471
        }else{
2472
            for(y=0; y < b_h; y++){
2473
                for(x=0; x < b_w; x++){
2474
                    dst[x + y*stride]= color;
2475
                }
2476
            }
2477
        }
2478
    }else{
2479
        uint8_t *src= s->last_picture[block->ref].data[plane_index];
2480
        const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2481
        int mx= block->mx*scale;
2482
        int my= block->my*scale;
2483
        const int dx= mx&15;
2484
        const int dy= my&15;
2485
        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2486
        sx += (mx>>4) - 2;
2487
        sy += (my>>4) - 2;
2488
        src += sx + sy*stride;
2489
        if(   (unsigned)sx >= w - b_w - 4
2490
           || (unsigned)sy >= h - b_h - 4){
2491
            ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2492
            src= tmp + MB_SIZE;
2493
        }
2494
//        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2495
//        assert(!(b_w&(b_w-1)));
2496
        assert(b_w>1 && b_h>1);
2497
        assert(tab_index>=0 && tab_index<4 || b_w==32);
2498
        if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2499
            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2500
        else if(b_w==32){
2501
            int y;
2502
            for(y=0; y<b_h; y+=16){
2503
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2504
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2505
            }
2506
        }else if(b_w==b_h)
2507
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2508
        else if(b_w==2*b_h){
2509
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2510
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2511
        }else{
2512
            assert(2*b_w==b_h);
2513
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2514
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2515
        }
2516
    }
2517
}
2518

    
2519
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2520
                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2521
    int y, x;
2522
    DWTELEM * dst;
2523
    for(y=0; y<b_h; y++){
2524
        //FIXME ugly misuse of obmc_stride
2525
        const uint8_t *obmc1= obmc + y*obmc_stride;
2526
        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2527
        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2528
        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2529
        dst = slice_buffer_get_line(sb, src_y + y);
2530
        for(x=0; x<b_w; x++){
2531
            int v=   obmc1[x] * block[3][x + y*src_stride]
2532
                    +obmc2[x] * block[2][x + y*src_stride]
2533
                    +obmc3[x] * block[1][x + y*src_stride]
2534
                    +obmc4[x] * block[0][x + y*src_stride];
2535

    
2536
            v <<= 8 - LOG2_OBMC_MAX;
2537
            if(FRAC_BITS != 8){
2538
                v += 1<<(7 - FRAC_BITS);
2539
                v >>= 8 - FRAC_BITS;
2540
            }
2541
            if(add){
2542
                v += dst[x + src_x];
2543
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2544
                if(v&(~255)) v= ~(v>>31);
2545
                dst8[x + y*src_stride] = v;
2546
            }else{
2547
                dst[x + src_x] -= v;
2548
            }
2549
        }
2550
    }
2551
}
2552

    
2553
//FIXME name clenup (b_w, block_w, b_width stuff)
2554
static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2555
    const int b_width = s->b_width  << s->block_max_depth;
2556
    const int b_height= s->b_height << s->block_max_depth;
2557
    const int b_stride= b_width;
2558
    BlockNode *lt= &s->block[b_x + b_y*b_stride];
2559
    BlockNode *rt= lt+1;
2560
    BlockNode *lb= lt+b_stride;
2561
    BlockNode *rb= lb+1;
2562
    uint8_t *block[4];
2563
    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2564
    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2565
    uint8_t *ptmp;
2566
    int x,y;
2567

    
2568
    if(b_x<0){
2569
        lt= rt;
2570
        lb= rb;
2571
    }else if(b_x + 1 >= b_width){
2572
        rt= lt;
2573
        rb= lb;
2574
    }
2575
    if(b_y<0){
2576
        lt= lb;
2577
        rt= rb;
2578
    }else if(b_y + 1 >= b_height){
2579
        lb= lt;
2580
        rb= rt;
2581
    }
2582

    
2583
    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2584
        obmc -= src_x;
2585
        b_w += src_x;
2586
        if(!sliced && !offset_dst)
2587
            dst -= src_x;
2588
        src_x=0;
2589
    }else if(src_x + b_w > w){
2590
        b_w = w - src_x;
2591
    }
2592
    if(src_y<0){
2593
        obmc -= src_y*obmc_stride;
2594
        b_h += src_y;
2595
        if(!sliced && !offset_dst)
2596
            dst -= src_y*dst_stride;
2597
        src_y=0;
2598
    }else if(src_y + b_h> h){
2599
        b_h = h - src_y;
2600
    }
2601

    
2602
    if(b_w<=0 || b_h<=0) return;
2603

    
2604
assert(src_stride > 2*MB_SIZE + 5);
2605
    if(!sliced && offset_dst)
2606
        dst += src_x + src_y*dst_stride;
2607
    dst8+= src_x + src_y*src_stride;
2608
//    src += src_x + src_y*src_stride;
2609

    
2610
    ptmp= tmp + 3*tmp_step;
2611
    block[0]= ptmp;
2612
    ptmp+=tmp_step;
2613
    pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2614

    
2615
    if(same_block(lt, rt)){
2616
        block[1]= block[0];
2617
    }else{
2618
        block[1]= ptmp;
2619
        ptmp+=tmp_step;
2620
        pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2621
    }
2622

    
2623
    if(same_block(lt, lb)){
2624
        block[2]= block[0];
2625
    }else if(same_block(rt, lb)){
2626
        block[2]= block[1];
2627
    }else{
2628
        block[2]= ptmp;
2629
        ptmp+=tmp_step;
2630
        pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2631
    }
2632

    
2633
    if(same_block(lt, rb) ){
2634
        block[3]= block[0];
2635
    }else if(same_block(rt, rb)){
2636
        block[3]= block[1];
2637
    }else if(same_block(lb, rb)){
2638
        block[3]= block[2];
2639
    }else{
2640
        block[3]= ptmp;
2641
        pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2642
    }
2643
#if 0
2644
    for(y=0; y<b_h; y++){
2645
        for(x=0; x<b_w; x++){
2646
            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2647
            if(add) dst[x + y*dst_stride] += v;
2648
            else    dst[x + y*dst_stride] -= v;
2649
        }
2650
    }
2651
    for(y=0; y<b_h; y++){
2652
        uint8_t *obmc2= obmc + (obmc_stride>>1);
2653
        for(x=0; x<b_w; x++){
2654
            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2655
            if(add) dst[x + y*dst_stride] += v;
2656
            else    dst[x + y*dst_stride] -= v;
2657
        }
2658
    }
2659
    for(y=0; y<b_h; y++){
2660
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2661
        for(x=0; x<b_w; x++){
2662
            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2663
            if(add) dst[x + y*dst_stride] += v;
2664
            else    dst[x + y*dst_stride] -= v;
2665
        }
2666
    }
2667
    for(y=0; y<b_h; y++){
2668
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2669
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2670
        for(x=0; x<b_w; x++){
2671
            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2672
            if(add) dst[x + y*dst_stride] += v;
2673
            else    dst[x + y*dst_stride] -= v;
2674
        }
2675
    }
2676
#else
2677
    if(sliced){
2678
        START_TIMER
2679

    
2680
        s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2681
        STOP_TIMER("inner_add_yblock")
2682
    }else
2683
    for(y=0; y<b_h; y++){
2684
        //FIXME ugly misuse of obmc_stride
2685
        const uint8_t *obmc1= obmc + y*obmc_stride;
2686
        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2687
        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2688
        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2689
        for(x=0; x<b_w; x++){
2690
            int v=   obmc1[x] * block[3][x + y*src_stride]
2691
                    +obmc2[x] * block[2][x + y*src_stride]
2692
                    +obmc3[x] * block[1][x + y*src_stride]
2693
                    +obmc4[x] * block[0][x + y*src_stride];
2694

    
2695
            v <<= 8 - LOG2_OBMC_MAX;
2696
            if(FRAC_BITS != 8){
2697
                v += 1<<(7 - FRAC_BITS);
2698
                v >>= 8 - FRAC_BITS;
2699
            }
2700
            if(add){
2701
                v += dst[x + y*dst_stride];
2702
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2703
                if(v&(~255)) v= ~(v>>31);
2704
                dst8[x + y*src_stride] = v;
2705
            }else{
2706
                dst[x + y*dst_stride] -= v;
2707
            }
2708
        }
2709
    }
2710
#endif
2711
}
2712

    
2713
static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2714
    Plane *p= &s->plane[plane_index];
2715
    const int mb_w= s->b_width  << s->block_max_depth;
2716
    const int mb_h= s->b_height << s->block_max_depth;
2717
    int x, y, mb_x;
2718
    int block_size = MB_SIZE >> s->block_max_depth;
2719
    int block_w    = plane_index ? block_size/2 : block_size;
2720
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2721
    int obmc_stride= plane_index ? block_size : 2*block_size;
2722
    int ref_stride= s->current_picture.linesize[plane_index];
2723
    uint8_t *dst8= s->current_picture.data[plane_index];
2724
    int w= p->width;
2725
    int h= p->height;
2726
    START_TIMER
2727

    
2728
    if(s->keyframe || (s->avctx->debug&512)){
2729
        if(mb_y==mb_h)
2730
            return;
2731

    
2732
        if(add){
2733
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2734
            {
2735
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2736
                DWTELEM * line = sb->line[y];
2737
                for(x=0; x<w; x++)
2738
                {
2739
//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2740
                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2741
                    v >>= FRAC_BITS;
2742
                    if(v&(~255)) v= ~(v>>31);
2743
                    dst8[x + y*ref_stride]= v;
2744
                }
2745
            }
2746
        }else{
2747
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2748
            {
2749
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2750
                DWTELEM * line = sb->line[y];
2751
                for(x=0; x<w; x++)
2752
                {
2753
                    line[x] -= 128 << FRAC_BITS;
2754
//                    buf[x + y*w]-= 128<<FRAC_BITS;
2755
                }
2756
            }
2757
        }
2758

    
2759
        return;
2760
    }
2761

    
2762
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2763
            START_TIMER
2764

    
2765
            add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2766
                       block_w*mb_x - block_w/2,
2767
                       block_w*mb_y - block_w/2,
2768
                       block_w, block_w,
2769
                       w, h,
2770
                       w, ref_stride, obmc_stride,
2771
                       mb_x - 1, mb_y - 1,
2772
                       add, 0, plane_index);
2773

    
2774
            STOP_TIMER("add_yblock")
2775
        }
2776

    
2777
    STOP_TIMER("predict_slice")
2778
}
2779

    
2780
static av_always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2781
    Plane *p= &s->plane[plane_index];
2782
    const int mb_w= s->b_width  << s->block_max_depth;
2783
    const int mb_h= s->b_height << s->block_max_depth;
2784
    int x, y, mb_x;
2785
    int block_size = MB_SIZE >> s->block_max_depth;
2786
    int block_w    = plane_index ? block_size/2 : block_size;
2787
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2788
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2789
    int ref_stride= s->current_picture.linesize[plane_index];
2790
    uint8_t *dst8= s->current_picture.data[plane_index];
2791
    int w= p->width;
2792
    int h= p->height;
2793
    START_TIMER
2794

    
2795
    if(s->keyframe || (s->avctx->debug&512)){
2796
        if(mb_y==mb_h)
2797
            return;
2798

    
2799
        if(add){
2800
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2801
                for(x=0; x<w; x++){
2802
                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2803
                    v >>= FRAC_BITS;
2804
                    if(v&(~255)) v= ~(v>>31);
2805
                    dst8[x + y*ref_stride]= v;
2806
                }
2807
            }
2808
        }else{
2809
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2810
                for(x=0; x<w; x++){
2811
                    buf[x + y*w]-= 128<<FRAC_BITS;
2812
                }
2813
            }
2814
        }
2815

    
2816
        return;
2817
    }
2818

    
2819
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2820
            START_TIMER
2821

    
2822
            add_yblock(s, 0, NULL, buf, dst8, obmc,
2823
                       block_w*mb_x - block_w/2,
2824
                       block_w*mb_y - block_w/2,
2825
                       block_w, block_w,
2826
                       w, h,
2827
                       w, ref_stride, obmc_stride,
2828
                       mb_x - 1, mb_y - 1,
2829
                       add, 1, plane_index);
2830

    
2831
            STOP_TIMER("add_yblock")
2832
        }
2833

    
2834
    STOP_TIMER("predict_slice")
2835
}
2836

    
2837
static av_always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2838
    const int mb_h= s->b_height << s->block_max_depth;
2839
    int mb_y;
2840
    for(mb_y=0; mb_y<=mb_h; mb_y++)
2841
        predict_slice(s, buf, plane_index, add, mb_y);
2842
}
2843

    
2844
static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2845
    int i, x2, y2;
2846
    Plane *p= &s->plane[plane_index];
2847
    const int block_size = MB_SIZE >> s->block_max_depth;
2848
    const int block_w    = plane_index ? block_size/2 : block_size;
2849
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2850
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2851
    const int ref_stride= s->current_picture.linesize[plane_index];
2852
    uint8_t *src= s-> input_picture.data[plane_index];
2853
    DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2854
    const int b_stride = s->b_width << s->block_max_depth;
2855
    const int w= p->width;
2856
    const int h= p->height;
2857
    int index= mb_x + mb_y*b_stride;
2858
    BlockNode *b= &s->block[index];
2859
    BlockNode backup= *b;
2860
    int ab=0;
2861
    int aa=0;
2862

    
2863
    b->type|= BLOCK_INTRA;
2864
    b->color[plane_index]= 0;
2865
    memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2866

    
2867
    for(i=0; i<4; i++){
2868
        int mb_x2= mb_x + (i &1) - 1;
2869
        int mb_y2= mb_y + (i>>1) - 1;
2870
        int x= block_w*mb_x2 + block_w/2;
2871
        int y= block_w*mb_y2 + block_w/2;
2872

    
2873
        add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2874
                    x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2875

    
2876
        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2877
            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2878
                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2879
                int obmc_v= obmc[index];
2880
                int d;
2881
                if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2882
                if(x<0) obmc_v += obmc[index + block_w];
2883
                if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2884
                if(x+block_w>w) obmc_v += obmc[index - block_w];
2885
                //FIXME precalc this or simplify it somehow else
2886

    
2887
                d = -dst[index] + (1<<(FRAC_BITS-1));
2888
                dst[index] = d;
2889
                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2890
                aa += obmc_v * obmc_v; //FIXME precalclate this
2891
            }
2892
        }
2893
    }
2894
    *b= backup;
2895

    
2896
    return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we should not need clipping
2897
}
2898

    
2899
static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2900
    const int b_stride = s->b_width << s->block_max_depth;
2901
    const int b_height = s->b_height<< s->block_max_depth;
2902
    int index= x + y*b_stride;
2903
    const BlockNode *b     = &s->block[index];
2904
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2905
    const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2906
    const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2907
    const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2908
    int dmx, dmy;
2909
//  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2910
//  int my_context= av_log2(2*FFABS(left->my - top->my));
2911

    
2912
    if(x<0 || x>=b_stride || y>=b_height)
2913
        return 0;
2914
/*
2915
1            0      0
2916
01X          1-2    1
2917
001XX        3-6    2-3
2918
0001XXX      7-14   4-7
2919
00001XXXX   15-30   8-15
2920
*/
2921
//FIXME try accurate rate
2922
//FIXME intra and inter predictors if surrounding blocks arent the same type
2923
    if(b->type & BLOCK_INTRA){
2924
        return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2925
                   + av_log2(2*FFABS(left->color[1] - b->color[1]))
2926
                   + av_log2(2*FFABS(left->color[2] - b->color[2])));
2927
    }else{
2928
        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2929
        dmx-= b->mx;
2930
        dmy-= b->my;
2931
        return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2932
                    + av_log2(2*FFABS(dmy))
2933
                    + av_log2(2*b->ref));
2934
    }
2935
}
2936

    
2937
static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2938
    Plane *p= &s->plane[plane_index];
2939
    const int block_size = MB_SIZE >> s->block_max_depth;
2940
    const int block_w    = plane_index ? block_size/2 : block_size;
2941
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2942
    const int ref_stride= s->current_picture.linesize[plane_index];
2943
    uint8_t *dst= s->current_picture.data[plane_index];
2944
    uint8_t *src= s->  input_picture.data[plane_index];
2945
    DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2946
    uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2947
    uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
2948
    const int b_stride = s->b_width << s->block_max_depth;
2949
    const int b_height = s->b_height<< s->block_max_depth;
2950
    const int w= p->width;
2951
    const int h= p->height;
2952
    int distortion;
2953
    int rate= 0;
2954
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2955
    int sx= block_w*mb_x - block_w/2;
2956
    int sy= block_w*mb_y - block_w/2;
2957
    int x0= FFMAX(0,-sx);
2958
    int y0= FFMAX(0,-sy);
2959
    int x1= FFMIN(block_w*2, w-sx);
2960
    int y1= FFMIN(block_w*2, h-sy);
2961
    int i,x,y;
2962

    
2963
    pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2964

    
2965
    for(y=y0; y<y1; y++){
2966
        const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2967
        const DWTELEM *pred1 = pred + y*obmc_stride;
2968
        uint8_t *cur1 = cur + y*ref_stride;
2969
        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2970
        for(x=x0; x<x1; x++){
2971
            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2972
            v = (v + pred1[x]) >> FRAC_BITS;
2973
            if(v&(~255)) v= ~(v>>31);
2974
            dst1[x] = v;
2975
        }
2976
    }
2977

    
2978
    /* copy the regions where obmc[] = (uint8_t)256 */
2979
    if(LOG2_OBMC_MAX == 8
2980
        && (mb_x == 0 || mb_x == b_stride-1)
2981
        && (mb_y == 0 || mb_y == b_height-1)){
2982
        if(mb_x == 0)
2983
            x1 = block_w;
2984
        else
2985
            x0 = block_w;
2986
        if(mb_y == 0)
2987
            y1 = block_w;
2988
        else
2989
            y0 = block_w;
2990
        for(y=y0; y<y1; y++)
2991
            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2992
    }
2993

    
2994
    if(block_w==16){
2995
        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
2996
        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
2997
        /* FIXME cmps overlap but don't cover the wavelet's whole support,
2998
         * so improving the score of one block is not strictly guaranteed to
2999
         * improve the score of the whole frame, so iterative motion est
3000
         * doesn't always converge. */
3001
        if(s->avctx->me_cmp == FF_CMP_W97)
3002
            distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3003
        else if(s->avctx->me_cmp == FF_CMP_W53)
3004
            distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3005
        else{
3006
            distortion = 0;
3007
            for(i=0; i<4; i++){
3008
                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3009
                distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3010
            }
3011
        }
3012
    }else{
3013
        assert(block_w==8);
3014
        distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3015
    }
3016

    
3017
    if(plane_index==0){
3018
        for(i=0; i<4; i++){
3019
/* ..RRr
3020
 * .RXx.
3021
 * rxx..
3022
 */
3023
            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3024
        }
3025
        if(mb_x == b_stride-2)
3026
            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3027
    }
3028
    return distortion + rate*penalty_factor;
3029
}
3030

    
3031
static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3032
    int i, y2;
3033
    Plane *p= &s->plane[plane_index];
3034
    const int block_size = MB_SIZE >> s->block_max_depth;
3035
    const int block_w    = plane_index ? block_size/2 : block_size;
3036
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3037
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3038
    const int ref_stride= s->current_picture.linesize[plane_index];
3039
    uint8_t *dst= s->current_picture.data[plane_index];
3040
    uint8_t *src= s-> input_picture.data[plane_index];
3041
    static const DWTELEM zero_dst[4096]; //FIXME
3042
    const int b_stride = s->b_width << s->block_max_depth;
3043
    const int w= p->width;
3044
    const int h= p->height;
3045
    int distortion= 0;
3046
    int rate= 0;
3047
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3048

    
3049
    for(i=0; i<9; i++){
3050
        int mb_x2= mb_x + (i%3) - 1;
3051
        int mb_y2= mb_y + (i/3) - 1;
3052
        int x= block_w*mb_x2 + block_w/2;
3053
        int y= block_w*mb_y2 + block_w/2;
3054

    
3055
        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
3056
                   x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3057

    
3058
        //FIXME find a cleaner/simpler way to skip the outside stuff
3059
        for(y2= y; y2<0; y2++)
3060
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3061
        for(y2= h; y2<y+block_w; y2++)
3062
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3063
        if(x<0){
3064
            for(y2= y; y2<y+block_w; y2++)
3065
                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3066
        }
3067
        if(x+block_w > w){
3068
            for(y2= y; y2<y+block_w; y2++)
3069
                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3070
        }
3071

    
3072
        assert(block_w== 8 || block_w==16);
3073
        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3074
    }
3075

    
3076
    if(plane_index==0){
3077
        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3078
        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3079

    
3080
/* ..RRRr
3081
 * .RXXx.
3082
 * .RXXx.
3083
 * rxxx.
3084
 */
3085
        if(merged)
3086
            rate = get_block_bits(s, mb_x, mb_y, 2);
3087
        for(i=merged?4:0; i<9; i++){
3088
            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3089
            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3090
        }
3091
    }
3092
    return distortion + rate*penalty_factor;
3093
}
3094

    
3095
static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3096
    const int b_stride= s->b_width << s->block_max_depth;
3097
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3098
    BlockNode backup= *block;
3099
    int rd, index, value;
3100

    
3101
    assert(mb_x>=0 && mb_y>=0);
3102
    assert(mb_x<b_stride);
3103

    
3104
    if(intra){
3105
        block->color[0] = p[0];
3106
        block->color[1] = p[1];
3107
        block->color[2] = p[2];
3108
        block->type |= BLOCK_INTRA;
3109
    }else{
3110
        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3111
        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3112
        if(s->me_cache[index] == value)
3113
            return 0;
3114
        s->me_cache[index]= value;
3115

    
3116
        block->mx= p[0];
3117
        block->my= p[1];
3118
        block->type &= ~BLOCK_INTRA;
3119
    }
3120

    
3121
    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3122

    
3123
//FIXME chroma
3124
    if(rd < *best_rd){
3125
        *best_rd= rd;
3126
        return 1;
3127
    }else{
3128
        *block= backup;
3129
        return 0;
3130
    }
3131
}
3132

    
3133
/* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3134
static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3135
    int p[2] = {p0, p1};
3136
    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3137
}
3138

    
3139
static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3140
    const int b_stride= s->b_width << s->block_max_depth;
3141
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3142
    BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3143
    int rd, index, value;
3144

    
3145
    assert(mb_x>=0 && mb_y>=0);
3146
    assert(mb_x<b_stride);
3147
    assert(((mb_x|mb_y)&1) == 0);
3148

    
3149
    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3150
    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3151
    if(s->me_cache[index] == value)
3152
        return 0;
3153
    s->me_cache[index]= value;
3154

    
3155
    block->mx= p0;
3156
    block->my= p1;
3157
    block->ref= ref;
3158
    block->type &= ~BLOCK_INTRA;
3159
    block[1]= block[b_stride]= block[b_stride+1]= *block;
3160

    
3161
    rd= get_4block_rd(s, mb_x, mb_y, 0);
3162

    
3163
//FIXME chroma
3164
    if(rd < *best_rd){
3165
        *best_rd= rd;
3166
        return 1;
3167
    }else{
3168
        block[0]= backup[0];
3169
        block[1]= backup[1];
3170
        block[b_stride]= backup[2];
3171
        block[b_stride+1]= backup[3];
3172
        return 0;
3173
    }
3174
}
3175

    
3176
static void iterative_me(SnowContext *s){
3177
    int pass, mb_x, mb_y;
3178
    const int b_width = s->b_width  << s->block_max_depth;
3179
    const int b_height= s->b_height << s->block_max_depth;
3180
    const int b_stride= b_width;
3181
    int color[3];
3182

    
3183
    {
3184
        RangeCoder r = s->c;
3185
        uint8_t state[sizeof(s->block_state)];
3186
        memcpy(state, s->block_state, sizeof(s->block_state));
3187
        for(mb_y= 0; mb_y<s->b_height; mb_y++)
3188
            for(mb_x= 0; mb_x<s->b_width; mb_x++)
3189
                encode_q_branch(s, 0, mb_x, mb_y);
3190
        s->c = r;
3191
        memcpy(s->block_state, state, sizeof(s->block_state));
3192
    }
3193

    
3194
    for(pass=0; pass<25; pass++){
3195
        int change= 0;
3196

    
3197
        for(mb_y= 0; mb_y<b_height; mb_y++){
3198
            for(mb_x= 0; mb_x<b_width; mb_x++){
3199
                int dia_change, i, j, ref;
3200
                int best_rd= INT_MAX, ref_rd;
3201
                BlockNode backup, ref_b;
3202
                const int index= mb_x + mb_y * b_stride;
3203
                BlockNode *block= &s->block[index];
3204
                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3205
                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3206
                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3207
                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3208
                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3209
                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3210
                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3211
                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3212
                const int b_w= (MB_SIZE >> s->block_max_depth);
3213
                uint8_t obmc_edged[b_w*2][b_w*2];
3214

    
3215
                if(pass && (block->type & BLOCK_OPT))
3216
                    continue;
3217
                block->type |= BLOCK_OPT;
3218

    
3219
                backup= *block;
3220

    
3221
                if(!s->me_cache_generation)
3222
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3223
                s->me_cache_generation += 1<<22;
3224

    
3225
                //FIXME precalc
3226
                {
3227
                    int x, y;
3228
                    memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3229
                    if(mb_x==0)
3230
                        for(y=0; y<b_w*2; y++)
3231
                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3232
                    if(mb_x==b_stride-1)
3233
                        for(y=0; y<b_w*2; y++)
3234
                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3235
                    if(mb_y==0){
3236
                        for(x=0; x<b_w*2; x++)
3237
                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
3238
                        for(y=1; y<b_w; y++)
3239
                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3240
                    }
3241
                    if(mb_y==b_height-1){
3242
                        for(x=0; x<b_w*2; x++)
3243
                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3244
                        for(y=b_w; y<b_w*2-1; y++)
3245
                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3246
                    }
3247
                }
3248

    
3249
                //skip stuff outside the picture
3250
                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3251
                {
3252
                    uint8_t *src= s->  input_picture.data[0];
3253
                    uint8_t *dst= s->current_picture.data[0];
3254
                    const int stride= s->current_picture.linesize[0];
3255
                    const int block_w= MB_SIZE >> s->block_max_depth;
3256
                    const int sx= block_w*mb_x - block_w/2;
3257
                    const int sy= block_w*mb_y - block_w/2;
3258
                    const int w= s->plane[0].width;
3259
                    const int h= s->plane[0].height;
3260
                    int y;
3261

    
3262
                    for(y=sy; y<0; y++)
3263
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3264
                    for(y=h; y<sy+block_w*2; y++)
3265
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3266
                    if(sx<0){
3267
                        for(y=sy; y<sy+block_w*2; y++)
3268
                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3269
                    }
3270
                    if(sx+block_w*2 > w){
3271
                        for(y=sy; y<sy+block_w*2; y++)
3272
                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3273
                    }
3274
                }
3275

    
3276
                // intra(black) = neighbors' contribution to the current block
3277
                for(i=0; i<3; i++)
3278
                    color[i]= get_dc(s, mb_x, mb_y, i);
3279

    
3280
                // get previous score (cannot be cached due to OBMC)
3281
                if(pass > 0 && (block->type&BLOCK_INTRA)){
3282
                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
3283
                    check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3284
                }else
3285
                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3286

    
3287
                ref_b= *block;
3288
                ref_rd= best_rd;
3289
                for(ref=0; ref < s->ref_frames; ref++){
3290
                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3291
                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3292
                        continue;
3293
                    block->ref= ref;
3294
                    best_rd= INT_MAX;
3295

    
3296
                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3297
                    check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3298
                    if(tb)
3299
                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3300
                    if(lb)
3301
                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3302
                    if(rb)
3303
                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3304
                    if(bb)
3305
                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3306

    
3307
                    /* fullpel ME */
3308
                    //FIXME avoid subpel interpol / round to nearest integer
3309
                    do{
3310
                        dia_change=0;
3311
                        for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3312
                            for(j=0; j<i; j++){
3313
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3314
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3315
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3316
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3317
                            }
3318
                        }
3319
                    }while(dia_change);
3320
                    /* subpel ME */
3321
                    do{
3322
                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3323
                        dia_change=0;
3324
                        for(i=0; i<8; i++)
3325
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3326
                    }while(dia_change);
3327
                    //FIXME or try the standard 2 pass qpel or similar
3328

    
3329
                    mvr[0][0]= block->mx;
3330
                    mvr[0][1]= block->my;
3331
                    if(ref_rd > best_rd){
3332
                        ref_rd= best_rd;
3333
                        ref_b= *block;
3334
                    }
3335
                }
3336
                best_rd= ref_rd;
3337
                *block= ref_b;
3338
#if 1
3339
                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3340
                //FIXME RD style color selection
3341
#endif
3342
                if(!same_block(block, &backup)){
3343
                    if(tb ) tb ->type &= ~BLOCK_OPT;
3344
                    if(lb ) lb ->type &= ~BLOCK_OPT;
3345
                    if(rb ) rb ->type &= ~BLOCK_OPT;
3346
                    if(bb ) bb ->type &= ~BLOCK_OPT;
3347
                    if(tlb) tlb->type &= ~BLOCK_OPT;
3348
                    if(trb) trb->type &= ~BLOCK_OPT;
3349
                    if(blb) blb->type &= ~BLOCK_OPT;
3350
                    if(brb) brb->type &= ~BLOCK_OPT;
3351
                    change ++;
3352
                }
3353
            }
3354
        }
3355
        av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3356
        if(!change)
3357
            break;
3358
    }
3359

    
3360
    if(s->block_max_depth == 1){
3361
        int change= 0;
3362
        for(mb_y= 0; mb_y<b_height; mb_y+=2){
3363
            for(mb_x= 0; mb_x<b_width; mb_x+=2){
3364
                int i;
3365
                int best_rd, init_rd;
3366
                const int index= mb_x + mb_y * b_stride;
3367
                BlockNode *b[4];
3368

    
3369
                b[0]= &s->block[index];
3370
                b[1]= b[0]+1;
3371
                b[2]= b[0]+b_stride;
3372
                b[3]= b[2]+1;
3373
                if(same_block(b[0], b[1]) &&
3374
                   same_block(b[0], b[2]) &&
3375
                   same_block(b[0], b[3]))
3376
                    continue;
3377

    
3378
                if(!s->me_cache_generation)
3379
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3380
                s->me_cache_generation += 1<<22;
3381

    
3382
                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3383

    
3384
                //FIXME more multiref search?
3385
                check_4block_inter(s, mb_x, mb_y,
3386
                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3387
                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3388

    
3389
                for(i=0; i<4; i++)
3390
                    if(!(b[i]->type&BLOCK_INTRA))
3391
                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3392

    
3393
                if(init_rd != best_rd)
3394
                    change++;
3395
            }
3396
        }
3397
        av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3398
    }
3399
}
3400

    
3401
static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3402
    const int level= b->level;
3403
    const int w= b->width;
3404
    const int h= b->height;
3405
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3406
    const int qmul= qexp[qlog&(QROOT-1)]<<((qlog>>QSHIFT) + ENCODER_EXTRA_BITS);
3407
    int x,y, thres1, thres2;
3408
//    START_TIMER
3409

    
3410
    if(s->qlog == LOSSLESS_QLOG) return;
3411

    
3412
    bias= bias ? 0 : (3*qmul)>>3;
3413
    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3414
    thres2= 2*thres1;
3415

    
3416
    if(!bias){
3417
        for(y=0; y<h; y++){
3418
            for(x=0; x<w; x++){
3419
                int i= src[x + y*stride];
3420

    
3421
                if((unsigned)(i+thres1) > thres2){
3422
                    if(i>=0){
3423
                        i<<= QEXPSHIFT;
3424
                        i/= qmul; //FIXME optimize
3425
                        src[x + y*stride]=  i;
3426
                    }else{
3427
                        i= -i;
3428
                        i<<= QEXPSHIFT;
3429
                        i/= qmul; //FIXME optimize
3430
                        src[x + y*stride]= -i;
3431
                    }
3432
                }else
3433
                    src[x + y*stride]= 0;
3434
            }
3435
        }
3436
    }else{
3437
        for(y=0; y<h; y++){
3438
            for(x=0; x<w; x++){
3439
                int i= src[x + y*stride];
3440

    
3441
                if((unsigned)(i+thres1) > thres2){
3442
                    if(i>=0){
3443
                        i<<= QEXPSHIFT;
3444
                        i= (i + bias) / qmul; //FIXME optimize
3445
                        src[x + y*stride]=  i;
3446
                    }else{
3447
                        i= -i;
3448
                        i<<= QEXPSHIFT;
3449
                        i= (i + bias) / qmul; //FIXME optimize
3450
                        src[x + y*stride]= -i;
3451
                    }
3452
                }else
3453
                    src[x + y*stride]= 0;
3454
            }
3455
        }
3456
    }
3457
    if(level+1 == s->spatial_decomposition_count){
3458
//        STOP_TIMER("quantize")
3459
    }
3460
}
3461

    
3462
static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3463
    const int w= b->width;
3464
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3465
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3466
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3467
    int x,y;
3468
    START_TIMER
3469

    
3470
    if(s->qlog == LOSSLESS_QLOG) return;
3471

    
3472
    for(y=start_y; y<end_y; y++){
3473
//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3474
        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3475
        for(x=0; x<w; x++){
3476
            int i= line[x];
3477
            if(i<0){
3478
                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3479
            }else if(i>0){
3480
                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3481
            }
3482
        }
3483
    }
3484
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3485
        STOP_TIMER("dquant")
3486
    }
3487
}
3488

    
3489
static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3490
    const int w= b->width;
3491
    const int h= b->height;
3492
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3493
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3494
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3495
    int x,y;
3496
    START_TIMER
3497

    
3498
    if(s->qlog == LOSSLESS_QLOG) return;
3499

    
3500
    for(y=0; y<h; y++){
3501
        for(x=0; x<w; x++){
3502
            int i= src[x + y*stride];
3503
            if(i<0){
3504
                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3505
            }else if(i>0){
3506
                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3507
            }
3508
        }
3509
    }
3510
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3511
        STOP_TIMER("dquant")
3512
    }
3513
}
3514

    
3515
static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3516
    const int w= b->width;
3517
    const int h= b->height;
3518
    int x,y;
3519

    
3520
    for(y=h-1; y>=0; y--){
3521
        for(x=w-1; x>=0; x--){
3522
            int i= x + y*stride;
3523

    
3524
            if(x){
3525
                if(use_median){
3526
                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3527
                    else  src[i] -= src[i - 1];
3528
                }else{
3529
                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3530
                    else  src[i] -= src[i - 1];
3531
                }
3532
            }else{
3533
                if(y) src[i] -= src[i - stride];
3534
            }
3535
        }
3536
    }
3537
}
3538

    
3539
static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3540
    const int w= b->width;
3541
    int x,y;
3542

    
3543
//    START_TIMER
3544

    
3545
    DWTELEM * line=0; // silence silly "could be used without having been initialized" warning
3546
    DWTELEM * prev;
3547

    
3548
    if (start_y != 0)
3549
        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3550

    
3551
    for(y=start_y; y<end_y; y++){
3552
        prev = line;
3553
//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3554
        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3555
        for(x=0; x<w; x++){
3556
            if(x){
3557
                if(use_median){
3558
                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3559
                    else  line[x] += line[x - 1];
3560
                }else{
3561
                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3562
                    else  line[x] += line[x - 1];
3563
                }
3564
            }else{
3565
                if(y) line[x] += prev[x];
3566
            }
3567
        }
3568
    }
3569

    
3570
//    STOP_TIMER("correlate")
3571
}
3572

    
3573
static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3574
    const int w= b->width;
3575
    const int h= b->height;
3576
    int x,y;
3577

    
3578
    for(y=0; y<h; y++){
3579
        for(x=0; x<w; x++){
3580
            int i= x + y*stride;
3581

    
3582
            if(x){
3583
                if(use_median){
3584
                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3585
                    else  src[i] += src[i - 1];
3586
                }else{
3587
                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3588
                    else  src[i] += src[i - 1];
3589
                }
3590
            }else{
3591
                if(y) src[i] += src[i - stride];
3592
            }
3593
        }
3594
    }
3595
}
3596

    
3597
static void encode_header(SnowContext *s){
3598
    int plane_index, level, orientation;
3599
    uint8_t kstate[32];
3600

    
3601
    memset(kstate, MID_STATE, sizeof(kstate));
3602

    
3603
    put_rac(&s->c, kstate, s->keyframe);
3604
    if(s->keyframe || s->always_reset){
3605
        reset_contexts(s);
3606
        s->last_spatial_decomposition_type=
3607
        s->last_qlog=
3608
        s->last_qbias=
3609
        s->last_mv_scale=
3610
        s->last_block_max_depth= 0;
3611
    }
3612
    if(s->keyframe){
3613
        put_symbol(&s->c, s->header_state, s->version, 0);
3614
        put_rac(&s->c, s->header_state, s->always_reset);
3615
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3616
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3617
        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3618
        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3619
        put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3620
        put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3621
        put_rac(&s->c, s->header_state, s->spatial_scalability);
3622
//        put_rac(&s->c, s->header_state, s->rate_scalability);
3623
        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3624

    
3625
        for(plane_index=0; plane_index<2; plane_index++){
3626
            for(level=0; level<s->spatial_decomposition_count; level++){
3627
                for(orientation=level ? 1:0; orientation<4; orientation++){
3628
                    if(orientation==2) continue;
3629
                    put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3630
                }
3631
            }
3632
        }
3633
    }
3634
    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3635
    put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3636
    put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3637
    put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3638
    put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3639

    
3640
    s->last_spatial_decomposition_type= s->spatial_decomposition_type;
3641
    s->last_qlog                      = s->qlog;
3642
    s->last_qbias                     = s->qbias;
3643
    s->last_mv_scale                  = s->mv_scale;
3644
    s->last_block_max_depth           = s->block_max_depth;
3645
}
3646

    
3647
static int decode_header(SnowContext *s){
3648
    int plane_index, level, orientation;
3649
    uint8_t kstate[32];
3650

    
3651
    memset(kstate, MID_STATE, sizeof(kstate));
3652

    
3653
    s->keyframe= get_rac(&s->c, kstate);
3654
    if(s->keyframe || s->always_reset){
3655
        reset_contexts(s);
3656
        s->spatial_decomposition_type=
3657
        s->qlog=
3658
        s->qbias=
3659
        s->mv_scale=
3660
        s->block_max_depth= 0;
3661
    }
3662
    if(s->keyframe){
3663
        s->version= get_symbol(&s->c, s->header_state, 0);
3664
        if(s->version>0){
3665
            av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3666
            return -1;
3667
        }
3668
        s->always_reset= get_rac(&s->c, s->header_state);
3669
        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3670
        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3671
        s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3672
        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3673
        s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3674
        s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3675
        s->spatial_scalability= get_rac(&s->c, s->header_state);
3676
//        s->rate_scalability= get_rac(&s->c, s->header_state);
3677
        s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3678

    
3679
        for(plane_index=0; plane_index<3; plane_index++){
3680
            for(level=0; level<s->spatial_decomposition_count; level++){
3681
                for(orientation=level ? 1:0; orientation<4; orientation++){
3682
                    int q;
3683
                    if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3684
                    else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3685
                    else                    q= get_symbol(&s->c, s->header_state, 1);
3686
                    s->plane[plane_index].band[level][orientation].qlog= q;
3687
                }
3688
            }
3689
        }
3690
    }
3691

    
3692
    s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3693
    if(s->spatial_decomposition_type > 2){
3694
        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3695
        return -1;
3696
    }
3697

    
3698
    s->qlog           += get_symbol(&s->c, s->header_state, 1);
3699
    s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3700
    s->qbias          += get_symbol(&s->c, s->header_state, 1);
3701
    s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3702
    if(s->block_max_depth > 1 || s->block_max_depth < 0){
3703
        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3704
        s->block_max_depth= 0;
3705
        return -1;
3706
    }
3707

    
3708
    return 0;
3709
}
3710

    
3711
static void init_qexp(void){
3712
    int i;
3713
    double v=128;
3714

    
3715
    for(i=0; i<QROOT; i++){
3716
        qexp[i]= lrintf(v);
3717
        v *= pow(2, 1.0 / QROOT);
3718
    }
3719
}
3720

    
3721
static int common_init(AVCodecContext *avctx){
3722
    SnowContext *s = avctx->priv_data;
3723
    int width, height;
3724
    int level, orientation, plane_index, dec;
3725
    int i, j;
3726

    
3727
    s->avctx= avctx;
3728

    
3729
    dsputil_init(&s->dsp, avctx);
3730

    
3731
#define mcf(dx,dy)\
3732
    s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3733
    s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3734
        s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3735
    s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3736
    s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3737
        s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3738

    
3739
    mcf( 0, 0)
3740
    mcf( 4, 0)
3741
    mcf( 8, 0)
3742
    mcf(12, 0)
3743
    mcf( 0, 4)
3744
    mcf( 4, 4)
3745
    mcf( 8, 4)
3746
    mcf(12, 4)
3747
    mcf( 0, 8)
3748
    mcf( 4, 8)
3749
    mcf( 8, 8)
3750
    mcf(12, 8)
3751
    mcf( 0,12)
3752
    mcf( 4,12)
3753
    mcf( 8,12)
3754
    mcf(12,12)
3755

    
3756
#define mcfh(dx,dy)\
3757
    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3758
    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3759
        mc_block_hpel ## dx ## dy ## 16;\
3760
    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3761
    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3762
        mc_block_hpel ## dx ## dy ## 8;
3763

    
3764
    mcfh(0, 0)
3765
    mcfh(8, 0)
3766
    mcfh(0, 8)
3767
    mcfh(8, 8)
3768

    
3769
    if(!qexp[0])
3770
        init_qexp();
3771

    
3772
    dec= s->spatial_decomposition_count= 5;
3773
    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3774

    
3775
    s->chroma_h_shift= 1; //FIXME XXX
3776
    s->chroma_v_shift= 1;
3777

    
3778
//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3779

    
3780
    width= s->avctx->width;
3781
    height= s->avctx->height;
3782

    
3783
    s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3784

    
3785
    s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3786
    s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3787

    
3788
    for(plane_index=0; plane_index<3; plane_index++){
3789
        int w= s->avctx->width;
3790
        int h= s->avctx->height;
3791

    
3792
        if(plane_index){
3793
            w>>= s->chroma_h_shift;
3794
            h>>= s->chroma_v_shift;
3795
        }
3796
        s->plane[plane_index].width = w;
3797
        s->plane[plane_index].height= h;
3798
//av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3799
        for(level=s->spatial_decomposition_count-1; level>=0; level--){
3800
            for(orientation=level ? 1 : 0; orientation<4; orientation++){
3801
                SubBand *b= &s->plane[plane_index].band[level][orientation];
3802

    
3803
                b->buf= s->spatial_dwt_buffer;
3804
                b->level= level;
3805
                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3806
                b->width = (w + !(orientation&1))>>1;
3807
                b->height= (h + !(orientation>1))>>1;
3808

    
3809
                b->stride_line = 1 << (s->spatial_decomposition_count - level);
3810
                b->buf_x_offset = 0;
3811
                b->buf_y_offset = 0;
3812

    
3813
                if(orientation&1){
3814
                    b->buf += (w+1)>>1;
3815
                    b->buf_x_offset = (w+1)>>1;
3816
                }
3817
                if(orientation>1){
3818
                    b->buf += b->stride>>1;
3819
                    b->buf_y_offset = b->stride_line >> 1;
3820
                }
3821

    
3822
                if(level)
3823
                    b->parent= &s->plane[plane_index].band[level-1][orientation];
3824
                b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3825
            }
3826
            w= (w+1)>>1;
3827
            h= (h+1)>>1;
3828
        }
3829
    }
3830

    
3831
    for(i=0; i<MAX_REF_FRAMES; i++)
3832
        for(j=0; j<MAX_REF_FRAMES; j++)
3833
            scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3834

    
3835
    reset_contexts(s);
3836
/*
3837
    width= s->width= avctx->width;
3838
    height= s->height= avctx->height;
3839

3840
    assert(width && height);
3841
*/
3842
    s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3843

    
3844
    return 0;
3845
}
3846

    
3847
static int qscale2qlog(int qscale){
3848
    return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3849
           + 61*QROOT/8; //<64 >60
3850
}
3851

    
3852
static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3853
{
3854
    /* estimate the frame's complexity as a sum of weighted dwt coefs.
3855
     * FIXME we know exact mv bits at this point,
3856
     * but ratecontrol isn't set up to include them. */
3857
    uint32_t coef_sum= 0;
3858
    int level, orientation, delta_qlog;
3859

    
3860
    for(level=0; level<s->spatial_decomposition_count; level++){
3861
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3862
            SubBand *b= &s->plane[0].band[level][orientation];
3863
            DWTELEM *buf= b->buf;
3864
            const int w= b->width;
3865
            const int h= b->height;
3866
            const int stride= b->stride;
3867
            const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3868
            const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3869
            const int qdiv= (1<<16)/qmul;
3870
            int x, y;
3871
            if(orientation==0)
3872
                decorrelate(s, b, buf, stride, 1, 0);
3873
            for(y=0; y<h; y++)
3874
                for(x=0; x<w; x++)
3875
                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3876
            if(orientation==0)
3877
                correlate(s, b, buf, stride, 1, 0);
3878
        }
3879
    }
3880

    
3881
    /* ugly, ratecontrol just takes a sqrt again */
3882
    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3883
    assert(coef_sum < INT_MAX);
3884

    
3885
    if(pict->pict_type == I_TYPE){
3886
        s->m.current_picture.mb_var_sum= coef_sum;
3887
        s->m.current_picture.mc_mb_var_sum= 0;
3888
    }else{
3889
        s->m.current_picture.mc_mb_var_sum= coef_sum;
3890
        s->m.current_picture.mb_var_sum= 0;
3891
    }
3892

    
3893
    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3894
    if (pict->quality < 0)
3895
        return INT_MIN;
3896
    s->lambda= pict->quality * 3/2;
3897
    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3898
    s->qlog+= delta_qlog;
3899
    return delta_qlog;
3900
}
3901

    
3902
static void calculate_vissual_weight(SnowContext *s, Plane *p){
3903
    int width = p->width;
3904
    int height= p->height;
3905
    int level, orientation, x, y;
3906

    
3907
    for(level=0; level<s->spatial_decomposition_count; level++){
3908
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3909
            SubBand *b= &p->band[level][orientation];
3910
            DWTELEM *buf= b->buf;
3911
            int64_t error=0;
3912

    
3913
            memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3914
            buf[b->width/2 + b->height/2*b->stride]= 256*256;
3915
            ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3916
            for(y=0; y<height; y++){
3917
                for(x=0; x<width; x++){
3918
                    int64_t d= s->spatial_dwt_buffer[x + y*width];
3919
                    error += d*d;
3920
                }
3921
            }
3922

    
3923
            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3924
//            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3925
        }
3926
    }
3927
}
3928

    
3929
static int encode_init(AVCodecContext *avctx)
3930
{
3931
    SnowContext *s = avctx->priv_data;
3932
    int plane_index;
3933

    
3934
    if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3935
        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3936
               "use vstrict=-2 / -strict -2 to use it anyway\n");
3937
        return -1;
3938
    }
3939

    
3940
    if(avctx->prediction_method == DWT_97
3941
       && (avctx->flags & CODEC_FLAG_QSCALE)
3942
       && avctx->global_quality == 0){
3943
        av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
3944
        return -1;
3945
    }
3946

    
3947
    common_init(avctx);
3948
    alloc_blocks(s);
3949

    
3950
    s->version=0;
3951

    
3952
    s->m.avctx   = avctx;
3953
    s->m.flags   = avctx->flags;
3954
    s->m.bit_rate= avctx->bit_rate;
3955

    
3956
    s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3957
    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3958
    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3959
    s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3960
    h263_encode_init(&s->m); //mv_penalty
3961

    
3962
    s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
3963

    
3964
    if(avctx->flags&CODEC_FLAG_PASS1){
3965
        if(!avctx->stats_out)
3966
            avctx->stats_out = av_mallocz(256);
3967
    }
3968
    if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
3969
        if(ff_rate_control_init(&s->m) < 0)
3970
            return -1;
3971
    }
3972
    s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
3973

    
3974
    for(plane_index=0; plane_index<3; plane_index++){
3975
        calculate_vissual_weight(s, &s->plane[plane_index]);
3976
    }
3977

    
3978

    
3979
    avctx->coded_frame= &s->current_picture;
3980
    switch(avctx->pix_fmt){
3981
//    case PIX_FMT_YUV444P:
3982
//    case PIX_FMT_YUV422P:
3983
    case PIX_FMT_YUV420P:
3984
    case PIX_FMT_GRAY8:
3985
//    case PIX_FMT_YUV411P:
3986
//    case PIX_FMT_YUV410P:
3987
        s->colorspace_type= 0;
3988
        break;
3989
/*    case PIX_FMT_RGB32:
3990
        s->colorspace= 1;
3991
        break;*/
3992
    default:
3993
        av_log(avctx, AV_LOG_ERROR, "format not supported\n");
3994
        return -1;
3995
    }
3996
//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
3997
    s->chroma_h_shift= 1;
3998
    s->chroma_v_shift= 1;
3999

    
4000
    ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4001
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4002

    
4003
    s->avctx->get_buffer(s->avctx, &s->input_picture);
4004

    
4005
    if(s->avctx->me_method == ME_ITER){
4006
        int i;
4007
        int size= s->b_width * s->b_height << 2*s->block_max_depth;
4008
        for(i=0; i<s->max_ref_frames; i++){
4009
            s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4010
            s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4011
        }
4012
    }
4013

    
4014
    return 0;
4015
}
4016

    
4017
static int frame_start(SnowContext *s){
4018
   AVFrame tmp;
4019
   int w= s->avctx->width; //FIXME round up to x16 ?
4020
   int h= s->avctx->height;
4021

    
4022
    if(s->current_picture.data[0]){
4023
        draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4024
        draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4025
        draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4026
    }
4027

    
4028
    tmp= s->last_picture[s->max_ref_frames-1];
4029
    memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4030
    s->last_picture[0]= s->current_picture;
4031
    s->current_picture= tmp;
4032

    
4033
    if(s->keyframe){
4034
        s->ref_frames= 0;
4035
    }else{
4036
        int i;
4037
        for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4038
            if(i && s->last_picture[i-1].key_frame)
4039
                break;
4040
        s->ref_frames= i;
4041
    }