Statistics
| Branch: | Revision:

ffmpeg / libavcodec / snow.c @ 2029f312

History | View | Annotate | Download (168 KB)

1
/*
2
 * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "avcodec.h"
22
#include "dsputil.h"
23
#include "snow.h"
24

    
25
#include "rangecoder.h"
26

    
27
#include "mpegvideo.h"
28

    
29
#undef NDEBUG
30
#include <assert.h>
31

    
32
static const int8_t quant3[256]={
33
 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
34
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
35
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
36
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
42
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
43
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
44
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
45
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
46
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
47
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
48
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
49
};
50
static const int8_t quant3b[256]={
51
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
57
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
59
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
60
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
61
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
62
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
63
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
64
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
65
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
66
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
67
};
68
static const int8_t quant3bA[256]={
69
 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
70
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
71
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
72
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
73
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
74
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
75
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
76
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
77
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
78
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
79
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
80
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
81
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
82
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
83
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
84
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
85
};
86
static const int8_t quant5[256]={
87
 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
88
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
91
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
92
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
93
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
94
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
95
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
96
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
97
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
98
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
99
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
100
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
101
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
102
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
103
};
104
static const int8_t quant7[256]={
105
 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
106
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
107
 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
108
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
109
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
110
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
111
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
112
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
113
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
114
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
115
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
116
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
117
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
118
-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
119
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
120
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
121
};
122
static const int8_t quant9[256]={
123
 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
124
 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
125
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
126
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
127
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
128
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
129
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
130
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
131
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
132
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
133
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
134
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
135
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
136
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
137
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
138
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
139
};
140
static const int8_t quant11[256]={
141
 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
142
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
143
 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
144
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
145
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
146
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
147
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
148
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
149
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
150
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
151
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
152
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
153
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
154
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
155
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
156
-4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
157
};
158
static const int8_t quant13[256]={
159
 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
160
 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
161
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
162
 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
163
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
164
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
165
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
166
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
167
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
168
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
169
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
170
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
171
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
172
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
173
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
174
-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
175
};
176

    
177
#if 0 //64*cubic
178
static const uint8_t obmc32[1024]={
179
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
181
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
182
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
183
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
184
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
185
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
186
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
187
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
188
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
189
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
190
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
191
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
192
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
193
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
194
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
195
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
196
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
197
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
198
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
199
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
200
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
201
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
202
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
203
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
204
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
205
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
206
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
207
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
208
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
209
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
210
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
211
//error:0.000022
212
};
213
static const uint8_t obmc16[256]={
214
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
215
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
216
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
217
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
218
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
219
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
220
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
221
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
222
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
223
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
224
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
225
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
226
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
227
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
228
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
229
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
230
//error:0.000033
231
};
232
#elif 1 // 64*linear
233
static const uint8_t obmc32[1024]={
234
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
235
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
236
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
237
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
238
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
239
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
240
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
241
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
242
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
243
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
244
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
245
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
246
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
247
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
248
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
249
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
250
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
251
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
252
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
253
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
254
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
255
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
256
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
257
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
258
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
259
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
260
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
261
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
262
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
263
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
264
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
265
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
266
 //error:0.000020
267
};
268
static const uint8_t obmc16[256]={
269
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
270
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
271
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
272
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
273
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
274
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
275
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
276
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
277
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
278
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
279
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
280
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
281
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
282
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
283
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
284
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
285
//error:0.000015
286
};
287
#else //64*cos
288
static const uint8_t obmc32[1024]={
289
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
290
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
291
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
292
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
293
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
294
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
295
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
296
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
297
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
298
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
299
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
300
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
301
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
302
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
303
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
304
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
305
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
306
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
307
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
308
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
309
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
310
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
311
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
312
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
313
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
314
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
315
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
316
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
317
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
318
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
319
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
320
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
321
//error:0.000022
322
};
323
static const uint8_t obmc16[256]={
324
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
325
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
326
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
327
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
328
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
329
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
330
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
331
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
332
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
333
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
334
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
335
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
336
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
337
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
338
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
339
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
340
//error:0.000022
341
};
342
#endif
343

    
344
//linear *64
345
static const uint8_t obmc8[64]={
346
  4, 12, 20, 28, 28, 20, 12,  4,
347
 12, 36, 60, 84, 84, 60, 36, 12,
348
 20, 60,100,140,140,100, 60, 20,
349
 28, 84,140,196,196,140, 84, 28,
350
 28, 84,140,196,196,140, 84, 28,
351
 20, 60,100,140,140,100, 60, 20,
352
 12, 36, 60, 84, 84, 60, 36, 12,
353
  4, 12, 20, 28, 28, 20, 12,  4,
354
//error:0.000000
355
};
356

    
357
//linear *64
358
static const uint8_t obmc4[16]={
359
 16, 48, 48, 16,
360
 48,144,144, 48,
361
 48,144,144, 48,
362
 16, 48, 48, 16,
363
//error:0.000000
364
};
365

    
366
static const uint8_t *obmc_tab[4]={
367
    obmc32, obmc16, obmc8, obmc4
368
};
369

    
370
static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
371

    
372
typedef struct BlockNode{
373
    int16_t mx;
374
    int16_t my;
375
    uint8_t ref;
376
    uint8_t color[3];
377
    uint8_t type;
378
//#define TYPE_SPLIT    1
379
#define BLOCK_INTRA   1
380
#define BLOCK_OPT     2
381
//#define TYPE_NOCOLOR  4
382
    uint8_t level; //FIXME merge into type?
383
}BlockNode;
384

    
385
static const BlockNode null_block= { //FIXME add border maybe
386
    .color= {128,128,128},
387
    .mx= 0,
388
    .my= 0,
389
    .ref= 0,
390
    .type= 0,
391
    .level= 0,
392
};
393

    
394
#define LOG2_MB_SIZE 4
395
#define MB_SIZE (1<<LOG2_MB_SIZE)
396

    
397
typedef struct x_and_coeff{
398
    int16_t x;
399
    uint16_t coeff;
400
} x_and_coeff;
401

    
402
typedef struct SubBand{
403
    int level;
404
    int stride;
405
    int width;
406
    int height;
407
    int qlog;                                   ///< log(qscale)/log[2^(1/6)]
408
    DWTELEM *buf;
409
    int buf_x_offset;
410
    int buf_y_offset;
411
    int stride_line; ///< Stride measured in lines, not pixels.
412
    x_and_coeff * x_coeff;
413
    struct SubBand *parent;
414
    uint8_t state[/*7*2*/ 7 + 512][32];
415
}SubBand;
416

    
417
typedef struct Plane{
418
    int width;
419
    int height;
420
    SubBand band[MAX_DECOMPOSITIONS][4];
421
}Plane;
422

    
423
typedef struct SnowContext{
424
//    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
425

    
426
    AVCodecContext *avctx;
427
    RangeCoder c;
428
    DSPContext dsp;
429
    AVFrame new_picture;
430
    AVFrame input_picture;              ///< new_picture with the internal linesizes
431
    AVFrame current_picture;
432
    AVFrame last_picture[MAX_REF_FRAMES];
433
    AVFrame mconly_picture;
434
//     uint8_t q_context[16];
435
    uint8_t header_state[32];
436
    uint8_t block_state[128 + 32*128];
437
    int keyframe;
438
    int always_reset;
439
    int version;
440
    int spatial_decomposition_type;
441
    int last_spatial_decomposition_type;
442
    int temporal_decomposition_type;
443
    int spatial_decomposition_count;
444
    int temporal_decomposition_count;
445
    int max_ref_frames;
446
    int ref_frames;
447
    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
448
    uint32_t *ref_scores[MAX_REF_FRAMES];
449
    DWTELEM *spatial_dwt_buffer;
450
    int colorspace_type;
451
    int chroma_h_shift;
452
    int chroma_v_shift;
453
    int spatial_scalability;
454
    int qlog;
455
    int last_qlog;
456
    int lambda;
457
    int lambda2;
458
    int pass1_rc;
459
    int mv_scale;
460
    int last_mv_scale;
461
    int qbias;
462
    int last_qbias;
463
#define QBIAS_SHIFT 3
464
    int b_width;
465
    int b_height;
466
    int block_max_depth;
467
    int last_block_max_depth;
468
    Plane plane[MAX_PLANES];
469
    BlockNode *block;
470
#define ME_CACHE_SIZE 1024
471
    int me_cache[ME_CACHE_SIZE];
472
    int me_cache_generation;
473
    slice_buffer sb;
474

    
475
    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
476
}SnowContext;
477

    
478
typedef struct {
479
    DWTELEM *b0;
480
    DWTELEM *b1;
481
    DWTELEM *b2;
482
    DWTELEM *b3;
483
    int y;
484
} dwt_compose_t;
485

    
486
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
487
//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
488

    
489
static void iterative_me(SnowContext *s);
490

    
491
static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
492
{
493
    int i;
494

    
495
    buf->base_buffer = base_buffer;
496
    buf->line_count = line_count;
497
    buf->line_width = line_width;
498
    buf->data_count = max_allocated_lines;
499
    buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
500
    buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
501

    
502
    for (i = 0; i < max_allocated_lines; i++)
503
    {
504
      buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
505
    }
506

    
507
    buf->data_stack_top = max_allocated_lines - 1;
508
}
509

    
510
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
511
{
512
    int offset;
513
    DWTELEM * buffer;
514

    
515
//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
516

    
517
    assert(buf->data_stack_top >= 0);
518
//  assert(!buf->line[line]);
519
    if (buf->line[line])
520
        return buf->line[line];
521

    
522
    offset = buf->line_width * line;
523
    buffer = buf->data_stack[buf->data_stack_top];
524
    buf->data_stack_top--;
525
    buf->line[line] = buffer;
526

    
527
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
528

    
529
    return buffer;
530
}
531

    
532
static void slice_buffer_release(slice_buffer * buf, int line)
533
{
534
    int offset;
535
    DWTELEM * buffer;
536

    
537
    assert(line >= 0 && line < buf->line_count);
538
    assert(buf->line[line]);
539

    
540
    offset = buf->line_width * line;
541
    buffer = buf->line[line];
542
    buf->data_stack_top++;
543
    buf->data_stack[buf->data_stack_top] = buffer;
544
    buf->line[line] = NULL;
545

    
546
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
547
}
548

    
549
static void slice_buffer_flush(slice_buffer * buf)
550
{
551
    int i;
552
    for (i = 0; i < buf->line_count; i++)
553
    {
554
        if (buf->line[i])
555
        {
556
//      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
557
            slice_buffer_release(buf, i);
558
        }
559
    }
560
}
561

    
562
static void slice_buffer_destroy(slice_buffer * buf)
563
{
564
    int i;
565
    slice_buffer_flush(buf);
566

    
567
    for (i = buf->data_count - 1; i >= 0; i--)
568
    {
569
        assert(buf->data_stack[i]);
570
        av_freep(&buf->data_stack[i]);
571
    }
572
    assert(buf->data_stack);
573
    av_freep(&buf->data_stack);
574
    assert(buf->line);
575
    av_freep(&buf->line);
576
}
577

    
578
#ifdef __sgi
579
// Avoid a name clash on SGI IRIX
580
#undef qexp
581
#endif
582
#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
583
static uint8_t qexp[QROOT];
584

    
585
static inline int mirror(int v, int m){
586
    while((unsigned)v > (unsigned)m){
587
        v=-v;
588
        if(v<0) v+= 2*m;
589
    }
590
    return v;
591
}
592

    
593
static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
594
    int i;
595

    
596
    if(v){
597
        const int a= FFABS(v);
598
        const int e= av_log2(a);
599
#if 1
600
        const int el= FFMIN(e, 10);
601
        put_rac(c, state+0, 0);
602

    
603
        for(i=0; i<el; i++){
604
            put_rac(c, state+1+i, 1);  //1..10
605
        }
606
        for(; i<e; i++){
607
            put_rac(c, state+1+9, 1);  //1..10
608
        }
609
        put_rac(c, state+1+FFMIN(i,9), 0);
610

    
611
        for(i=e-1; i>=el; i--){
612
            put_rac(c, state+22+9, (a>>i)&1); //22..31
613
        }
614
        for(; i>=0; i--){
615
            put_rac(c, state+22+i, (a>>i)&1); //22..31
616
        }
617

    
618
        if(is_signed)
619
            put_rac(c, state+11 + el, v < 0); //11..21
620
#else
621

    
622
        put_rac(c, state+0, 0);
623
        if(e<=9){
624
            for(i=0; i<e; i++){
625
                put_rac(c, state+1+i, 1);  //1..10
626
            }
627
            put_rac(c, state+1+i, 0);
628

    
629
            for(i=e-1; i>=0; i--){
630
                put_rac(c, state+22+i, (a>>i)&1); //22..31
631
            }
632

    
633
            if(is_signed)
634
                put_rac(c, state+11 + e, v < 0); //11..21
635
        }else{
636
            for(i=0; i<e; i++){
637
                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
638
            }
639
            put_rac(c, state+1+FFMIN(i,9), 0);
640

    
641
            for(i=e-1; i>=0; i--){
642
                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
643
            }
644

    
645
            if(is_signed)
646
                put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
647
        }
648
#endif
649
    }else{
650
        put_rac(c, state+0, 1);
651
    }
652
}
653

    
654
static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
655
    if(get_rac(c, state+0))
656
        return 0;
657
    else{
658
        int i, e, a;
659
        e= 0;
660
        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
661
            e++;
662
        }
663

    
664
        a= 1;
665
        for(i=e-1; i>=0; i--){
666
            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
667
        }
668

    
669
        if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
670
            return -a;
671
        else
672
            return a;
673
    }
674
}
675

    
676
static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
677
    int i;
678
    int r= log2>=0 ? 1<<log2 : 1;
679

    
680
    assert(v>=0);
681
    assert(log2>=-4);
682

    
683
    while(v >= r){
684
        put_rac(c, state+4+log2, 1);
685
        v -= r;
686
        log2++;
687
        if(log2>0) r+=r;
688
    }
689
    put_rac(c, state+4+log2, 0);
690

    
691
    for(i=log2-1; i>=0; i--){
692
        put_rac(c, state+31-i, (v>>i)&1);
693
    }
694
}
695

    
696
static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
697
    int i;
698
    int r= log2>=0 ? 1<<log2 : 1;
699
    int v=0;
700

    
701
    assert(log2>=-4);
702

    
703
    while(get_rac(c, state+4+log2)){
704
        v+= r;
705
        log2++;
706
        if(log2>0) r+=r;
707
    }
708

    
709
    for(i=log2-1; i>=0; i--){
710
        v+= get_rac(c, state+31-i)<<i;
711
    }
712

    
713
    return v;
714
}
715

    
716
static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
717
    const int mirror_left= !highpass;
718
    const int mirror_right= (width&1) ^ highpass;
719
    const int w= (width>>1) - 1 + (highpass & width);
720
    int i;
721

    
722
#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
723
    if(mirror_left){
724
        dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
725
        dst += dst_step;
726
        src += src_step;
727
    }
728

    
729
    for(i=0; i<w; i++){
730
        dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
731
    }
732

    
733
    if(mirror_right){
734
        dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
735
    }
736
}
737

    
738
#ifndef lift5
739
static av_always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
740
    const int mirror_left= !highpass;
741
    const int mirror_right= (width&1) ^ highpass;
742
    const int w= (width>>1) - 1 + (highpass & width);
743
    int i;
744

    
745
    if(mirror_left){
746
        int r= 3*2*ref[0];
747
        r += r>>4;
748
        r += r>>8;
749
        dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
750
        dst += dst_step;
751
        src += src_step;
752
    }
753

    
754
    for(i=0; i<w; i++){
755
        int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
756
        r += r>>4;
757
        r += r>>8;
758
        dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
759
    }
760

    
761
    if(mirror_right){
762
        int r= 3*2*ref[w*ref_step];
763
        r += r>>4;
764
        r += r>>8;
765
        dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
766
    }
767
}
768
#endif
769

    
770
#ifndef liftS
771
static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
772
    const int mirror_left= !highpass;
773
    const int mirror_right= (width&1) ^ highpass;
774
    const int w= (width>>1) - 1 + (highpass & width);
775
    int i;
776

    
777
    assert(shift == 4);
778
#define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
779
    if(mirror_left){
780
        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
781
        dst += dst_step;
782
        src += src_step;
783
    }
784

    
785
    for(i=0; i<w; i++){
786
        dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
787
    }
788

    
789
    if(mirror_right){
790
        dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
791
    }
792
}
793
#endif
794

    
795

    
796
static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
797
    int x, i;
798

    
799
    for(x=start; x<width; x+=2){
800
        int64_t sum=0;
801

    
802
        for(i=0; i<n; i++){
803
            int x2= x + 2*i - n + 1;
804
            if     (x2<     0) x2= -x2;
805
            else if(x2>=width) x2= 2*width-x2-2;
806
            sum += coeffs[i]*(int64_t)dst[x2];
807
        }
808
        if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
809
        else        dst[x] += (sum + (1<<shift)/2)>>shift;
810
    }
811
}
812

    
813
static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
814
    int x, y, i;
815
    for(y=start; y<height; y+=2){
816
        for(x=0; x<width; x++){
817
            int64_t sum=0;
818

    
819
            for(i=0; i<n; i++){
820
                int y2= y + 2*i - n + 1;
821
                if     (y2<      0) y2= -y2;
822
                else if(y2>=height) y2= 2*height-y2-2;
823
                sum += coeffs[i]*(int64_t)dst[x + y2*stride];
824
            }
825
            if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
826
            else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
827
        }
828
    }
829
}
830

    
831
#define SCALEX 1
832
#define LX0 0
833
#define LX1 1
834

    
835
#if 0 // more accurate 9/7
836
#define N1 2
837
#define SHIFT1 14
838
#define COEFFS1 (int[]){-25987,-25987}
839
#define N2 2
840
#define SHIFT2 19
841
#define COEFFS2 (int[]){-27777,-27777}
842
#define N3 2
843
#define SHIFT3 15
844
#define COEFFS3 (int[]){28931,28931}
845
#define N4 2
846
#define SHIFT4 15
847
#define COEFFS4 (int[]){14533,14533}
848
#elif 1 // 13/7 CRF
849
#define N1 4
850
#define SHIFT1 4
851
#define COEFFS1 (int[]){1,-9,-9,1}
852
#define N2 4
853
#define SHIFT2 4
854
#define COEFFS2 (int[]){-1,5,5,-1}
855
#define N3 0
856
#define SHIFT3 1
857
#define COEFFS3 NULL
858
#define N4 0
859
#define SHIFT4 1
860
#define COEFFS4 NULL
861
#elif 1 // 3/5
862
#define LX0 1
863
#define LX1 0
864
#define SCALEX 0.5
865
#define N1 2
866
#define SHIFT1 1
867
#define COEFFS1 (int[]){1,1}
868
#define N2 2
869
#define SHIFT2 2
870
#define COEFFS2 (int[]){-1,-1}
871
#define N3 0
872
#define SHIFT3 0
873
#define COEFFS3 NULL
874
#define N4 0
875
#define SHIFT4 0
876
#define COEFFS4 NULL
877
#elif 1 // 11/5
878
#define N1 0
879
#define SHIFT1 1
880
#define COEFFS1 NULL
881
#define N2 2
882
#define SHIFT2 2
883
#define COEFFS2 (int[]){-1,-1}
884
#define N3 2
885
#define SHIFT3 0
886
#define COEFFS3 (int[]){-1,-1}
887
#define N4 4
888
#define SHIFT4 7
889
#define COEFFS4 (int[]){-5,29,29,-5}
890
#define SCALEX 4
891
#elif 1 // 9/7 CDF
892
#define N1 2
893
#define SHIFT1 7
894
#define COEFFS1 (int[]){-203,-203}
895
#define N2 2
896
#define SHIFT2 12
897
#define COEFFS2 (int[]){-217,-217}
898
#define N3 2
899
#define SHIFT3 7
900
#define COEFFS3 (int[]){113,113}
901
#define N4 2
902
#define SHIFT4 9
903
#define COEFFS4 (int[]){227,227}
904
#define SCALEX 1
905
#elif 1 // 7/5 CDF
906
#define N1 0
907
#define SHIFT1 1
908
#define COEFFS1 NULL
909
#define N2 2
910
#define SHIFT2 2
911
#define COEFFS2 (int[]){-1,-1}
912
#define N3 2
913
#define SHIFT3 0
914
#define COEFFS3 (int[]){-1,-1}
915
#define N4 2
916
#define SHIFT4 4
917
#define COEFFS4 (int[]){3,3}
918
#elif 1 // 9/7 MN
919
#define N1 4
920
#define SHIFT1 4
921
#define COEFFS1 (int[]){1,-9,-9,1}
922
#define N2 2
923
#define SHIFT2 2
924
#define COEFFS2 (int[]){1,1}
925
#define N3 0
926
#define SHIFT3 1
927
#define COEFFS3 NULL
928
#define N4 0
929
#define SHIFT4 1
930
#define COEFFS4 NULL
931
#else // 13/7 CRF
932
#define N1 4
933
#define SHIFT1 4
934
#define COEFFS1 (int[]){1,-9,-9,1}
935
#define N2 4
936
#define SHIFT2 4
937
#define COEFFS2 (int[]){-1,5,5,-1}
938
#define N3 0
939
#define SHIFT3 1
940
#define COEFFS3 NULL
941
#define N4 0
942
#define SHIFT4 1
943
#define COEFFS4 NULL
944
#endif
945
static void horizontal_decomposeX(DWTELEM *b, int width){
946
    DWTELEM temp[width];
947
    const int width2= width>>1;
948
    const int w2= (width+1)>>1;
949
    int x;
950

    
951
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
952
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
953
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
954
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
955

    
956
    for(x=0; x<width2; x++){
957
        temp[x   ]= b[2*x    ];
958
        temp[x+w2]= b[2*x + 1];
959
    }
960
    if(width&1)
961
        temp[x   ]= b[2*x    ];
962
    memcpy(b, temp, width*sizeof(int));
963
}
964

    
965
static void horizontal_composeX(DWTELEM *b, int width){
966
    DWTELEM temp[width];
967
    const int width2= width>>1;
968
    int x;
969
    const int w2= (width+1)>>1;
970

    
971
    memcpy(temp, b, width*sizeof(int));
972
    for(x=0; x<width2; x++){
973
        b[2*x    ]= temp[x   ];
974
        b[2*x + 1]= temp[x+w2];
975
    }
976
    if(width&1)
977
        b[2*x    ]= temp[x   ];
978

    
979
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
980
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
981
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
982
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
983
}
984

    
985
static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
986
    int x, y;
987

    
988
    for(y=0; y<height; y++){
989
        for(x=0; x<width; x++){
990
            buffer[y*stride + x] *= SCALEX;
991
        }
992
    }
993

    
994
    for(y=0; y<height; y++){
995
        horizontal_decomposeX(buffer + y*stride, width);
996
    }
997

    
998
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
999
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
1000
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
1001
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
1002
}
1003

    
1004
static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
1005
    int x, y;
1006

    
1007
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1008
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1009
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1010
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1011

    
1012
    for(y=0; y<height; y++){
1013
        horizontal_composeX(buffer + y*stride, width);
1014
    }
1015

    
1016
    for(y=0; y<height; y++){
1017
        for(x=0; x<width; x++){
1018
            buffer[y*stride + x] /= SCALEX;
1019
        }
1020
    }
1021
}
1022

    
1023
static void horizontal_decompose53i(DWTELEM *b, int width){
1024
    DWTELEM temp[width];
1025
    const int width2= width>>1;
1026
    int x;
1027
    const int w2= (width+1)>>1;
1028

    
1029
    for(x=0; x<width2; x++){
1030
        temp[x   ]= b[2*x    ];
1031
        temp[x+w2]= b[2*x + 1];
1032
    }
1033
    if(width&1)
1034
        temp[x   ]= b[2*x    ];
1035
#if 0
1036
    {
1037
    int A1,A2,A3,A4;
1038
    A2= temp[1       ];
1039
    A4= temp[0       ];
1040
    A1= temp[0+width2];
1041
    A1 -= (A2 + A4)>>1;
1042
    A4 += (A1 + 1)>>1;
1043
    b[0+width2] = A1;
1044
    b[0       ] = A4;
1045
    for(x=1; x+1<width2; x+=2){
1046
        A3= temp[x+width2];
1047
        A4= temp[x+1     ];
1048
        A3 -= (A2 + A4)>>1;
1049
        A2 += (A1 + A3 + 2)>>2;
1050
        b[x+width2] = A3;
1051
        b[x       ] = A2;
1052

1053
        A1= temp[x+1+width2];
1054
        A2= temp[x+2       ];
1055
        A1 -= (A2 + A4)>>1;
1056
        A4 += (A1 + A3 + 2)>>2;
1057
        b[x+1+width2] = A1;
1058
        b[x+1       ] = A4;
1059
    }
1060
    A3= temp[width-1];
1061
    A3 -= A2;
1062
    A2 += (A1 + A3 + 2)>>2;
1063
    b[width -1] = A3;
1064
    b[width2-1] = A2;
1065
    }
1066
#else
1067
    lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1068
    lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1069
#endif
1070
}
1071

    
1072
static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1073
    int i;
1074

    
1075
    for(i=0; i<width; i++){
1076
        b1[i] -= (b0[i] + b2[i])>>1;
1077
    }
1078
}
1079

    
1080
static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1081
    int i;
1082

    
1083
    for(i=0; i<width; i++){
1084
        b1[i] += (b0[i] + b2[i] + 2)>>2;
1085
    }
1086
}
1087

    
1088
static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1089
    int y;
1090
    DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1091
    DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1092

    
1093
    for(y=-2; y<height; y+=2){
1094
        DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1095
        DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1096

    
1097
{START_TIMER
1098
        if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1099
        if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1100
STOP_TIMER("horizontal_decompose53i")}
1101

    
1102
{START_TIMER
1103
        if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1104
        if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1105
STOP_TIMER("vertical_decompose53i*")}
1106

    
1107
        b0=b2;
1108
        b1=b3;
1109
    }
1110
}
1111

    
1112
static void horizontal_decompose97i(DWTELEM *b, int width){
1113
    DWTELEM temp[width];
1114
    const int w2= (width+1)>>1;
1115

    
1116
    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1117
    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1118
    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1119
    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1120
}
1121

    
1122

    
1123
static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1124
    int i;
1125

    
1126
    for(i=0; i<width; i++){
1127
        b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1128
    }
1129
}
1130

    
1131
static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1132
    int i;
1133

    
1134
    for(i=0; i<width; i++){
1135
#ifdef lift5
1136
        b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1137
#else
1138
        int r= 3*(b0[i] + b2[i]);
1139
        r+= r>>4;
1140
        r+= r>>8;
1141
        b1[i] += (r+W_CO)>>W_CS;
1142
#endif
1143
    }
1144
}
1145

    
1146
static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1147
    int i;
1148

    
1149
    for(i=0; i<width; i++){
1150
#ifdef liftS
1151
        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1152
#else
1153
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1154
#endif
1155
    }
1156
}
1157

    
1158
static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1159
    int i;
1160

    
1161
    for(i=0; i<width; i++){
1162
        b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1163
    }
1164
}
1165

    
1166
static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1167
    int y;
1168
    DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1169
    DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1170
    DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1171
    DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1172

    
1173
    for(y=-4; y<height; y+=2){
1174
        DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1175
        DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1176

    
1177
{START_TIMER
1178
        if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1179
        if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1180
if(width>400){
1181
STOP_TIMER("horizontal_decompose97i")
1182
}}
1183

    
1184
{START_TIMER
1185
        if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1186
        if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1187
        if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1188
        if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1189

    
1190
if(width>400){
1191
STOP_TIMER("vertical_decompose97i")
1192
}}
1193

    
1194
        b0=b2;
1195
        b1=b3;
1196
        b2=b4;
1197
        b3=b5;
1198
    }
1199
}
1200

    
1201
void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1202
    int level;
1203

    
1204
    for(level=0; level<decomposition_count; level++){
1205
        switch(type){
1206
        case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1207
        case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1208
        case DWT_X: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1209
        }
1210
    }
1211
}
1212

    
1213
static void horizontal_compose53i(DWTELEM *b, int width){
1214
    DWTELEM temp[width];
1215
    const int width2= width>>1;
1216
    const int w2= (width+1)>>1;
1217
    int x;
1218

    
1219
#if 0
1220
    int A1,A2,A3,A4;
1221
    A2= temp[1       ];
1222
    A4= temp[0       ];
1223
    A1= temp[0+width2];
1224
    A1 -= (A2 + A4)>>1;
1225
    A4 += (A1 + 1)>>1;
1226
    b[0+width2] = A1;
1227
    b[0       ] = A4;
1228
    for(x=1; x+1<width2; x+=2){
1229
        A3= temp[x+width2];
1230
        A4= temp[x+1     ];
1231
        A3 -= (A2 + A4)>>1;
1232
        A2 += (A1 + A3 + 2)>>2;
1233
        b[x+width2] = A3;
1234
        b[x       ] = A2;
1235

1236
        A1= temp[x+1+width2];
1237
        A2= temp[x+2       ];
1238
        A1 -= (A2 + A4)>>1;
1239
        A4 += (A1 + A3 + 2)>>2;
1240
        b[x+1+width2] = A1;
1241
        b[x+1       ] = A4;
1242
    }
1243
    A3= temp[width-1];
1244
    A3 -= A2;
1245
    A2 += (A1 + A3 + 2)>>2;
1246
    b[width -1] = A3;
1247
    b[width2-1] = A2;
1248
#else
1249
    lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1250
    lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1251
#endif
1252
    for(x=0; x<width2; x++){
1253
        b[2*x    ]= temp[x   ];
1254
        b[2*x + 1]= temp[x+w2];
1255
    }
1256
    if(width&1)
1257
        b[2*x    ]= temp[x   ];
1258
}
1259

    
1260
static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1261
    int i;
1262

    
1263
    for(i=0; i<width; i++){
1264
        b1[i] += (b0[i] + b2[i])>>1;
1265
    }
1266
}
1267

    
1268
static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1269
    int i;
1270

    
1271
    for(i=0; i<width; i++){
1272
        b1[i] -= (b0[i] + b2[i] + 2)>>2;
1273
    }
1274
}
1275

    
1276
static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1277
    cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1278
    cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1279
    cs->y = -1;
1280
}
1281

    
1282
static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1283
    cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1284
    cs->b1 = buffer + mirror(-1  , height-1)*stride;
1285
    cs->y = -1;
1286
}
1287

    
1288
static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1289
    int y= cs->y;
1290

    
1291
    DWTELEM *b0= cs->b0;
1292
    DWTELEM *b1= cs->b1;
1293
    DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1294
    DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1295

    
1296
{START_TIMER
1297
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1298
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1299
STOP_TIMER("vertical_compose53i*")}
1300

    
1301
{START_TIMER
1302
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1303
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1304
STOP_TIMER("horizontal_compose53i")}
1305

    
1306
    cs->b0 = b2;
1307
    cs->b1 = b3;
1308
    cs->y += 2;
1309
}
1310

    
1311
static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1312
    int y= cs->y;
1313
    DWTELEM *b0= cs->b0;
1314
    DWTELEM *b1= cs->b1;
1315
    DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1316
    DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1317

    
1318
{START_TIMER
1319
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1320
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1321
STOP_TIMER("vertical_compose53i*")}
1322

    
1323
{START_TIMER
1324
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1325
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1326
STOP_TIMER("horizontal_compose53i")}
1327

    
1328
    cs->b0 = b2;
1329
    cs->b1 = b3;
1330
    cs->y += 2;
1331
}
1332

    
1333
static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1334
    dwt_compose_t cs;
1335
    spatial_compose53i_init(&cs, buffer, height, stride);
1336
    while(cs.y <= height)
1337
        spatial_compose53i_dy(&cs, buffer, width, height, stride);
1338
}
1339

    
1340

    
1341
void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1342
    DWTELEM temp[width];
1343
    const int w2= (width+1)>>1;
1344

    
1345
    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1346
    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1347
    liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1348
    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1349
}
1350

    
1351
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1352
    int i;
1353

    
1354
    for(i=0; i<width; i++){
1355
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1356
    }
1357
}
1358

    
1359
static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1360
    int i;
1361

    
1362
    for(i=0; i<width; i++){
1363
#ifdef lift5
1364
        b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1365
#else
1366
        int r= 3*(b0[i] + b2[i]);
1367
        r+= r>>4;
1368
        r+= r>>8;
1369
        b1[i] -= (r+W_CO)>>W_CS;
1370
#endif
1371
    }
1372
}
1373

    
1374
static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1375
    int i;
1376

    
1377
    for(i=0; i<width; i++){
1378
#ifdef liftS
1379
        b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1380
#else
1381
        b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1382
#endif
1383
    }
1384
}
1385

    
1386
static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1387
    int i;
1388

    
1389
    for(i=0; i<width; i++){
1390
        b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1391
    }
1392
}
1393

    
1394
void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1395
    int i;
1396

    
1397
    for(i=0; i<width; i++){
1398
#ifndef lift5
1399
        int r;
1400
#endif
1401
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1402
#ifdef lift5
1403
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1404
#else
1405
        r= 3*(b2[i] + b4[i]);
1406
        r+= r>>4;
1407
        r+= r>>8;
1408
        b3[i] -= (r+W_CO)>>W_CS;
1409
#endif
1410
#ifdef liftS
1411
        b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1412
#else
1413
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1414
#endif
1415
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1416
    }
1417
}
1418

    
1419
static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1420
    cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1421
    cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1422
    cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1423
    cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1424
    cs->y = -3;
1425
}
1426

    
1427
static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1428
    cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1429
    cs->b1 = buffer + mirror(-3  , height-1)*stride;
1430
    cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1431
    cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1432
    cs->y = -3;
1433
}
1434

    
1435
static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1436
    int y = cs->y;
1437

    
1438
    DWTELEM *b0= cs->b0;
1439
    DWTELEM *b1= cs->b1;
1440
    DWTELEM *b2= cs->b2;
1441
    DWTELEM *b3= cs->b3;
1442
    DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1443
    DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1444

    
1445
{START_TIMER
1446
    if(y>0 && y+4<height){
1447
        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1448
    }else{
1449
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1450
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1451
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1452
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1453
    }
1454
if(width>400){
1455
STOP_TIMER("vertical_compose97i")}}
1456

    
1457
{START_TIMER
1458
        if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1459
        if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1460
if(width>400 && y+0<(unsigned)height){
1461
STOP_TIMER("horizontal_compose97i")}}
1462

    
1463
    cs->b0=b2;
1464
    cs->b1=b3;
1465
    cs->b2=b4;
1466
    cs->b3=b5;
1467
    cs->y += 2;
1468
}
1469

    
1470
static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1471
    int y = cs->y;
1472
    DWTELEM *b0= cs->b0;
1473
    DWTELEM *b1= cs->b1;
1474
    DWTELEM *b2= cs->b2;
1475
    DWTELEM *b3= cs->b3;
1476
    DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1477
    DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1478

    
1479
{START_TIMER
1480
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1481
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1482
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1483
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1484
if(width>400){
1485
STOP_TIMER("vertical_compose97i")}}
1486

    
1487
{START_TIMER
1488
        if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1489
        if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1490
if(width>400 && b0 <= b2){
1491
STOP_TIMER("horizontal_compose97i")}}
1492

    
1493
    cs->b0=b2;
1494
    cs->b1=b3;
1495
    cs->b2=b4;
1496
    cs->b3=b5;
1497
    cs->y += 2;
1498
}
1499

    
1500
static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1501
    dwt_compose_t cs;
1502
    spatial_compose97i_init(&cs, buffer, height, stride);
1503
    while(cs.y <= height)
1504
        spatial_compose97i_dy(&cs, buffer, width, height, stride);
1505
}
1506

    
1507
static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1508
    int level;
1509
    for(level=decomposition_count-1; level>=0; level--){
1510
        switch(type){
1511
        case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1512
        case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1513
        /* not slicified yet */
1514
        case DWT_X: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1515
          av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1516
        }
1517
    }
1518
}
1519

    
1520
static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1521
    int level;
1522
    for(level=decomposition_count-1; level>=0; level--){
1523
        switch(type){
1524
        case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1525
        case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1526
        /* not slicified yet */
1527
        case DWT_X: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1528
        }
1529
    }
1530
}
1531

    
1532
static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1533
    const int support = type==1 ? 3 : 5;
1534
    int level;
1535
    if(type==2) return;
1536

    
1537
    for(level=decomposition_count-1; level>=0; level--){
1538
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1539
            switch(type){
1540
            case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1541
                    break;
1542
            case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1543
                    break;
1544
            case DWT_X: break;
1545
            }
1546
        }
1547
    }
1548
}
1549

    
1550
static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1551
    const int support = type==1 ? 3 : 5;
1552
    int level;
1553
    if(type==2) return;
1554

    
1555
    for(level=decomposition_count-1; level>=0; level--){
1556
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1557
            switch(type){
1558
            case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1559
                    break;
1560
            case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1561
                    break;
1562
            case DWT_X: break;
1563
            }
1564
        }
1565
    }
1566
}
1567

    
1568
static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1569
    if(type==2){
1570
        int level;
1571
        for(level=decomposition_count-1; level>=0; level--)
1572
            spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1573
    }else{
1574
        dwt_compose_t cs[MAX_DECOMPOSITIONS];
1575
        int y;
1576
        ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1577
        for(y=0; y<height; y+=4)
1578
            ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1579
    }
1580
}
1581

    
1582
static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1583
    const int w= b->width;
1584
    const int h= b->height;
1585
    int x, y;
1586

    
1587
    if(1){
1588
        int run=0;
1589
        int runs[w*h];
1590
        int run_index=0;
1591
        int max_index;
1592

    
1593
        for(y=0; y<h; y++){
1594
            for(x=0; x<w; x++){
1595
                int v, p=0;
1596
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1597
                v= src[x + y*stride];
1598

    
1599
                if(y){
1600
                    t= src[x + (y-1)*stride];
1601
                    if(x){
1602
                        lt= src[x - 1 + (y-1)*stride];
1603
                    }
1604
                    if(x + 1 < w){
1605
                        rt= src[x + 1 + (y-1)*stride];
1606
                    }
1607
                }
1608
                if(x){
1609
                    l= src[x - 1 + y*stride];
1610
                    /*if(x > 1){
1611
                        if(orientation==1) ll= src[y + (x-2)*stride];
1612
                        else               ll= src[x - 2 + y*stride];
1613
                    }*/
1614
                }
1615
                if(parent){
1616
                    int px= x>>1;
1617
                    int py= y>>1;
1618
                    if(px<b->parent->width && py<b->parent->height)
1619
                        p= parent[px + py*2*stride];
1620
                }
1621
                if(!(/*ll|*/l|lt|t|rt|p)){
1622
                    if(v){
1623
                        runs[run_index++]= run;
1624
                        run=0;
1625
                    }else{
1626
                        run++;
1627
                    }
1628
                }
1629
            }
1630
        }
1631
        max_index= run_index;
1632
        runs[run_index++]= run;
1633
        run_index=0;
1634
        run= runs[run_index++];
1635

    
1636
        put_symbol2(&s->c, b->state[30], max_index, 0);
1637
        if(run_index <= max_index)
1638
            put_symbol2(&s->c, b->state[1], run, 3);
1639

    
1640
        for(y=0; y<h; y++){
1641
            if(s->c.bytestream_end - s->c.bytestream < w*40){
1642
                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1643
                return -1;
1644
            }
1645
            for(x=0; x<w; x++){
1646
                int v, p=0;
1647
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1648
                v= src[x + y*stride];
1649

    
1650
                if(y){
1651
                    t= src[x + (y-1)*stride];
1652
                    if(x){
1653
                        lt= src[x - 1 + (y-1)*stride];
1654
                    }
1655
                    if(x + 1 < w){
1656
                        rt= src[x + 1 + (y-1)*stride];
1657
                    }
1658
                }
1659
                if(x){
1660
                    l= src[x - 1 + y*stride];
1661
                    /*if(x > 1){
1662
                        if(orientation==1) ll= src[y + (x-2)*stride];
1663
                        else               ll= src[x - 2 + y*stride];
1664
                    }*/
1665
                }
1666
                if(parent){
1667
                    int px= x>>1;
1668
                    int py= y>>1;
1669
                    if(px<b->parent->width && py<b->parent->height)
1670
                        p= parent[px + py*2*stride];
1671
                }
1672
                if(/*ll|*/l|lt|t|rt|p){
1673
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1674

    
1675
                    put_rac(&s->c, &b->state[0][context], !!v);
1676
                }else{
1677
                    if(!run){
1678
                        run= runs[run_index++];
1679

    
1680
                        if(run_index <= max_index)
1681
                            put_symbol2(&s->c, b->state[1], run, 3);
1682
                        assert(v);
1683
                    }else{
1684
                        run--;
1685
                        assert(!v);
1686
                    }
1687
                }
1688
                if(v){
1689
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1690
                    int l2= 2*FFABS(l) + (l<0);
1691
                    int t2= 2*FFABS(t) + (t<0);
1692

    
1693
                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1694
                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1695
                }
1696
            }
1697
        }
1698
    }
1699
    return 0;
1700
}
1701

    
1702
static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1703
//    encode_subband_qtree(s, b, src, parent, stride, orientation);
1704
//    encode_subband_z0run(s, b, src, parent, stride, orientation);
1705
    return encode_subband_c0run(s, b, src, parent, stride, orientation);
1706
//    encode_subband_dzr(s, b, src, parent, stride, orientation);
1707
}
1708

    
1709
static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1710
    const int w= b->width;
1711
    const int h= b->height;
1712
    int x,y;
1713

    
1714
    if(1){
1715
        int run, runs;
1716
        x_and_coeff *xc= b->x_coeff;
1717
        x_and_coeff *prev_xc= NULL;
1718
        x_and_coeff *prev2_xc= xc;
1719
        x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1720
        x_and_coeff *prev_parent_xc= parent_xc;
1721

    
1722
        runs= get_symbol2(&s->c, b->state[30], 0);
1723
        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1724
        else           run= INT_MAX;
1725

    
1726
        for(y=0; y<h; y++){
1727
            int v=0;
1728
            int lt=0, t=0, rt=0;
1729

    
1730
            if(y && prev_xc->x == 0){
1731
                rt= prev_xc->coeff;
1732
            }
1733
            for(x=0; x<w; x++){
1734
                int p=0;
1735
                const int l= v;
1736

    
1737
                lt= t; t= rt;
1738

    
1739
                if(y){
1740
                    if(prev_xc->x <= x)
1741
                        prev_xc++;
1742
                    if(prev_xc->x == x + 1)
1743
                        rt= prev_xc->coeff;
1744
                    else
1745
                        rt=0;
1746
                }
1747
                if(parent_xc){
1748
                    if(x>>1 > parent_xc->x){
1749
                        parent_xc++;
1750
                    }
1751
                    if(x>>1 == parent_xc->x){
1752
                        p= parent_xc->coeff;
1753
                    }
1754
                }
1755
                if(/*ll|*/l|lt|t|rt|p){
1756
                    int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1757

    
1758
                    v=get_rac(&s->c, &b->state[0][context]);
1759
                    if(v){
1760
                        v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1761
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1762

    
1763
                        xc->x=x;
1764
                        (xc++)->coeff= v;
1765
                    }
1766
                }else{
1767
                    if(!run){
1768
                        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1769
                        else           run= INT_MAX;
1770
                        v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1771
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1772

    
1773
                        xc->x=x;
1774
                        (xc++)->coeff= v;
1775
                    }else{
1776
                        int max_run;
1777
                        run--;
1778
                        v=0;
1779

    
1780
                        if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1781
                        else  max_run= FFMIN(run, w-x-1);
1782
                        if(parent_xc)
1783
                            max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1784
                        x+= max_run;
1785
                        run-= max_run;
1786
                    }
1787
                }
1788
            }
1789
            (xc++)->x= w+1; //end marker
1790
            prev_xc= prev2_xc;
1791
            prev2_xc= xc;
1792

    
1793
            if(parent_xc){
1794
                if(y&1){
1795
                    while(parent_xc->x != parent->width+1)
1796
                        parent_xc++;
1797
                    parent_xc++;
1798
                    prev_parent_xc= parent_xc;
1799
                }else{
1800
                    parent_xc= prev_parent_xc;
1801
                }
1802
            }
1803
        }
1804

    
1805
        (xc++)->x= w+1; //end marker
1806
    }
1807
}
1808

    
1809
static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1810
    const int w= b->width;
1811
    int y;
1812
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1813
    int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1814
    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1815
    int new_index = 0;
1816

    
1817
    START_TIMER
1818

    
1819
    if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1820
        qadd= 0;
1821
        qmul= 1<<QEXPSHIFT;
1822
    }
1823

    
1824
    /* If we are on the second or later slice, restore our index. */
1825
    if (start_y != 0)
1826
        new_index = save_state[0];
1827

    
1828

    
1829
    for(y=start_y; y<h; y++){
1830
        int x = 0;
1831
        int v;
1832
        DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1833
        memset(line, 0, b->width*sizeof(DWTELEM));
1834
        v = b->x_coeff[new_index].coeff;
1835
        x = b->x_coeff[new_index++].x;
1836
        while(x < w)
1837
        {
1838
            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1839
            register int u= -(v&1);
1840
            line[x] = (t^u) - u;
1841

    
1842
            v = b->x_coeff[new_index].coeff;
1843
            x = b->x_coeff[new_index++].x;
1844
        }
1845
    }
1846
    if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1847
        STOP_TIMER("decode_subband")
1848
    }
1849

    
1850
    /* Save our variables for the next slice. */
1851
    save_state[0] = new_index;
1852

    
1853
    return;
1854
}
1855

    
1856
static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1857
    int plane_index, level, orientation;
1858

    
1859
    for(plane_index=0; plane_index<3; plane_index++){
1860
        for(level=0; level<s->spatial_decomposition_count; level++){
1861
            for(orientation=level ? 1:0; orientation<4; orientation++){
1862
                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1863
            }
1864
        }
1865
    }
1866
    memset(s->header_state, MID_STATE, sizeof(s->header_state));
1867
    memset(s->block_state, MID_STATE, sizeof(s->block_state));
1868
}
1869

    
1870
static int alloc_blocks(SnowContext *s){
1871
    int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1872
    int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1873

    
1874
    s->b_width = w;
1875
    s->b_height= h;
1876

    
1877
    s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1878
    return 0;
1879
}
1880

    
1881
static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1882
    uint8_t *bytestream= d->bytestream;
1883
    uint8_t *bytestream_start= d->bytestream_start;
1884
    *d= *s;
1885
    d->bytestream= bytestream;
1886
    d->bytestream_start= bytestream_start;
1887
}
1888

    
1889
//near copy & paste from dsputil, FIXME
1890
static int pix_sum(uint8_t * pix, int line_size, int w)
1891
{
1892
    int s, i, j;
1893

    
1894
    s = 0;
1895
    for (i = 0; i < w; i++) {
1896
        for (j = 0; j < w; j++) {
1897
            s += pix[0];
1898
            pix ++;
1899
        }
1900
        pix += line_size - w;
1901
    }
1902
    return s;
1903
}
1904

    
1905
//near copy & paste from dsputil, FIXME
1906
static int pix_norm1(uint8_t * pix, int line_size, int w)
1907
{
1908
    int s, i, j;
1909
    uint32_t *sq = ff_squareTbl + 256;
1910

    
1911
    s = 0;
1912
    for (i = 0; i < w; i++) {
1913
        for (j = 0; j < w; j ++) {
1914
            s += sq[pix[0]];
1915
            pix ++;
1916
        }
1917
        pix += line_size - w;
1918
    }
1919
    return s;
1920
}
1921

    
1922
static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1923
    const int w= s->b_width << s->block_max_depth;
1924
    const int rem_depth= s->block_max_depth - level;
1925
    const int index= (x + y*w) << rem_depth;
1926
    const int block_w= 1<<rem_depth;
1927
    BlockNode block;
1928
    int i,j;
1929

    
1930
    block.color[0]= l;
1931
    block.color[1]= cb;
1932
    block.color[2]= cr;
1933
    block.mx= mx;
1934
    block.my= my;
1935
    block.ref= ref;
1936
    block.type= type;
1937
    block.level= level;
1938

    
1939
    for(j=0; j<block_w; j++){
1940
        for(i=0; i<block_w; i++){
1941
            s->block[index + i + j*w]= block;
1942
        }
1943
    }
1944
}
1945

    
1946
static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1947
    const int offset[3]= {
1948
          y*c->  stride + x,
1949
        ((y*c->uvstride + x)>>1),
1950
        ((y*c->uvstride + x)>>1),
1951
    };
1952
    int i;
1953
    for(i=0; i<3; i++){
1954
        c->src[0][i]= src [i];
1955
        c->ref[0][i]= ref [i] + offset[i];
1956
    }
1957
    assert(!ref_index);
1958
}
1959

    
1960
static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1961
                           const BlockNode *left, const BlockNode *top, const BlockNode *tr){
1962
    if(s->ref_frames == 1){
1963
        *mx = mid_pred(left->mx, top->mx, tr->mx);
1964
        *my = mid_pred(left->my, top->my, tr->my);
1965
    }else{
1966
        const int *scale = scale_mv_ref[ref];
1967
        *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
1968
                       (top ->mx * scale[top ->ref] + 128) >>8,
1969
                       (tr  ->mx * scale[tr  ->ref] + 128) >>8);
1970
        *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
1971
                       (top ->my * scale[top ->ref] + 128) >>8,
1972
                       (tr  ->my * scale[tr  ->ref] + 128) >>8);
1973
    }
1974
}
1975

    
1976
//FIXME copy&paste
1977
#define P_LEFT P[1]
1978
#define P_TOP P[2]
1979
#define P_TOPRIGHT P[3]
1980
#define P_MEDIAN P[4]
1981
#define P_MV1 P[9]
1982
#define FLAG_QPEL   1 //must be 1
1983

    
1984
static int encode_q_branch(SnowContext *s, int level, int x, int y){
1985
    uint8_t p_buffer[1024];
1986
    uint8_t i_buffer[1024];
1987
    uint8_t p_state[sizeof(s->block_state)];
1988
    uint8_t i_state[sizeof(s->block_state)];
1989
    RangeCoder pc, ic;
1990
    uint8_t *pbbak= s->c.bytestream;
1991
    uint8_t *pbbak_start= s->c.bytestream_start;
1992
    int score, score2, iscore, i_len, p_len, block_s, sum;
1993
    const int w= s->b_width  << s->block_max_depth;
1994
    const int h= s->b_height << s->block_max_depth;
1995
    const int rem_depth= s->block_max_depth - level;
1996
    const int index= (x + y*w) << rem_depth;
1997
    const int block_w= 1<<(LOG2_MB_SIZE - level);
1998
    int trx= (x+1)<<rem_depth;
1999
    int try= (y+1)<<rem_depth;
2000
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2001
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2002
    const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
2003
    const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2004
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2005
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2006
    int pl = left->color[0];
2007
    int pcb= left->color[1];
2008
    int pcr= left->color[2];
2009
    int pmx, pmy;
2010
    int mx=0, my=0;
2011
    int l,cr,cb;
2012
    const int stride= s->current_picture.linesize[0];
2013
    const int uvstride= s->current_picture.linesize[1];
2014
    uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2015
                                s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2016
                                s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2017
    int P[10][2];
2018
    int16_t last_mv[3][2];
2019
    int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2020
    const int shift= 1+qpel;
2021
    MotionEstContext *c= &s->m.me;
2022
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2023
    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2024
    int my_context= av_log2(2*FFABS(left->my - top->my));
2025
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2026
    int ref, best_ref, ref_score, ref_mx, ref_my;
2027

    
2028
    assert(sizeof(s->block_state) >= 256);
2029
    if(s->keyframe){
2030
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2031
        return 0;
2032
    }
2033

    
2034
//    clip predictors / edge ?
2035

    
2036
    P_LEFT[0]= left->mx;
2037
    P_LEFT[1]= left->my;
2038
    P_TOP [0]= top->mx;
2039
    P_TOP [1]= top->my;
2040
    P_TOPRIGHT[0]= tr->mx;
2041
    P_TOPRIGHT[1]= tr->my;
2042

    
2043
    last_mv[0][0]= s->block[index].mx;
2044
    last_mv[0][1]= s->block[index].my;
2045
    last_mv[1][0]= right->mx;
2046
    last_mv[1][1]= right->my;
2047
    last_mv[2][0]= bottom->mx;
2048
    last_mv[2][1]= bottom->my;
2049

    
2050
    s->m.mb_stride=2;
2051
    s->m.mb_x=
2052
    s->m.mb_y= 0;
2053
    c->skip= 0;
2054

    
2055
    assert(c->  stride ==   stride);
2056
    assert(c->uvstride == uvstride);
2057

    
2058
    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2059
    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2060
    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2061
    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2062

    
2063
    c->xmin = - x*block_w - 16+2;
2064
    c->ymin = - y*block_w - 16+2;
2065
    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2066
    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2067

    
2068
    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2069
    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2070
    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2071
    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2072
    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2073
    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2074
    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2075

    
2076
    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2077
    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2078

    
2079
    if (!y) {
2080
        c->pred_x= P_LEFT[0];
2081
        c->pred_y= P_LEFT[1];
2082
    } else {
2083
        c->pred_x = P_MEDIAN[0];
2084
        c->pred_y = P_MEDIAN[1];
2085
    }
2086

    
2087
    score= INT_MAX;
2088
    best_ref= 0;
2089
    for(ref=0; ref<s->ref_frames; ref++){
2090
        init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2091

    
2092
        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2093
                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2094

    
2095
        assert(ref_mx >= c->xmin);
2096
        assert(ref_mx <= c->xmax);
2097
        assert(ref_my >= c->ymin);
2098
        assert(ref_my <= c->ymax);
2099

    
2100
        ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2101
        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2102
        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2103
        if(s->ref_mvs[ref]){
2104
            s->ref_mvs[ref][index][0]= ref_mx;
2105
            s->ref_mvs[ref][index][1]= ref_my;
2106
            s->ref_scores[ref][index]= ref_score;
2107
        }
2108
        if(score > ref_score){
2109
            score= ref_score;
2110
            best_ref= ref;
2111
            mx= ref_mx;
2112
            my= ref_my;
2113
        }
2114
    }
2115
    //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
2116

    
2117
  //  subpel search
2118
    pc= s->c;
2119
    pc.bytestream_start=
2120
    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2121
    memcpy(p_state, s->block_state, sizeof(s->block_state));
2122

    
2123
    if(level!=s->block_max_depth)
2124
        put_rac(&pc, &p_state[4 + s_context], 1);
2125
    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2126
    if(s->ref_frames > 1)
2127
        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2128
    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
2129
    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2130
    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2131
    p_len= pc.bytestream - pc.bytestream_start;
2132
    score += (s->lambda2*(p_len*8
2133
              + (pc.outstanding_count - s->c.outstanding_count)*8
2134
              + (-av_log2(pc.range)    + av_log2(s->c.range))
2135
             ))>>FF_LAMBDA_SHIFT;
2136

    
2137
    block_s= block_w*block_w;
2138
    sum = pix_sum(current_data[0], stride, block_w);
2139
    l= (sum + block_s/2)/block_s;
2140
    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2141

    
2142
    block_s= block_w*block_w>>2;
2143
    sum = pix_sum(current_data[1], uvstride, block_w>>1);
2144
    cb= (sum + block_s/2)/block_s;
2145
//    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2146
    sum = pix_sum(current_data[2], uvstride, block_w>>1);
2147
    cr= (sum + block_s/2)/block_s;
2148
//    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2149

    
2150
    ic= s->c;
2151
    ic.bytestream_start=
2152
    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2153
    memcpy(i_state, s->block_state, sizeof(s->block_state));
2154
    if(level!=s->block_max_depth)
2155
        put_rac(&ic, &i_state[4 + s_context], 1);
2156
    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2157
    put_symbol(&ic, &i_state[32],  l-pl , 1);
2158
    put_symbol(&ic, &i_state[64], cb-pcb, 1);
2159
    put_symbol(&ic, &i_state[96], cr-pcr, 1);
2160
    i_len= ic.bytestream - ic.bytestream_start;
2161
    iscore += (s->lambda2*(i_len*8
2162
              + (ic.outstanding_count - s->c.outstanding_count)*8
2163
              + (-av_log2(ic.range)    + av_log2(s->c.range))
2164
             ))>>FF_LAMBDA_SHIFT;
2165

    
2166
//    assert(score==256*256*256*64-1);
2167
    assert(iscore < 255*255*256 + s->lambda2*10);
2168
    assert(iscore >= 0);
2169
    assert(l>=0 && l<=255);
2170
    assert(pl>=0 && pl<=255);
2171

    
2172
    if(level==0){
2173
        int varc= iscore >> 8;
2174
        int vard= score >> 8;
2175
        if (vard <= 64 || vard < varc)
2176
            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2177
        else
2178
            c->scene_change_score+= s->m.qscale;
2179
    }
2180

    
2181
    if(level!=s->block_max_depth){
2182
        put_rac(&s->c, &s->block_state[4 + s_context], 0);
2183
        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2184
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2185
        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2186
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2187
        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2188

    
2189
        if(score2 < score && score2 < iscore)
2190
            return score2;
2191
    }
2192

    
2193
    if(iscore < score){
2194
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2195
        memcpy(pbbak, i_buffer, i_len);
2196
        s->c= ic;
2197
        s->c.bytestream_start= pbbak_start;
2198
        s->c.bytestream= pbbak + i_len;
2199
        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2200
        memcpy(s->block_state, i_state, sizeof(s->block_state));
2201
        return iscore;
2202
    }else{
2203
        memcpy(pbbak, p_buffer, p_len);
2204
        s->c= pc;
2205
        s->c.bytestream_start= pbbak_start;
2206
        s->c.bytestream= pbbak + p_len;
2207
        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2208
        memcpy(s->block_state, p_state, sizeof(s->block_state));
2209
        return score;
2210
    }
2211
}
2212

    
2213
static av_always_inline int same_block(BlockNode *a, BlockNode *b){
2214
    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2215
        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2216
    }else{
2217
        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2218
    }
2219
}
2220

    
2221
static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2222
    const int w= s->b_width  << s->block_max_depth;
2223
    const int rem_depth= s->block_max_depth - level;
2224
    const int index= (x + y*w) << rem_depth;
2225
    int trx= (x+1)<<rem_depth;
2226
    BlockNode *b= &s->block[index];
2227
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2228
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2229
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2230
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2231
    int pl = left->color[0];
2232
    int pcb= left->color[1];
2233
    int pcr= left->color[2];
2234
    int pmx, pmy;
2235
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2236
    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2237
    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2238
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2239

    
2240
    if(s->keyframe){
2241
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2242
        return;
2243
    }
2244

    
2245
    if(level!=s->block_max_depth){
2246
        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2247
            put_rac(&s->c, &s->block_state[4 + s_context], 1);
2248
        }else{
2249
            put_rac(&s->c, &s->block_state[4 + s_context], 0);
2250
            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2251
            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2252
            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2253
            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2254
            return;
2255
        }
2256
    }
2257
    if(b->type & BLOCK_INTRA){
2258
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2259
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2260
        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2261
        put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2262
        put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2263
        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2264
    }else{
2265
        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2266
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2267
        if(s->ref_frames > 1)
2268
            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2269
        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2270
        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2271
        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2272
    }
2273
}
2274

    
2275
static void decode_q_branch(SnowContext *s, int level, int x, int y){
2276
    const int w= s->b_width << s->block_max_depth;
2277
    const int rem_depth= s->block_max_depth - level;
2278
    const int index= (x + y*w) << rem_depth;
2279
    int trx= (x+1)<<rem_depth;
2280
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2281
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2282
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2283
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2284
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2285

    
2286
    if(s->keyframe){
2287
        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2288
        return;
2289
    }
2290

    
2291
    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2292
        int type, mx, my;
2293
        int l = left->color[0];
2294
        int cb= left->color[1];
2295
        int cr= left->color[2];
2296
        int ref = 0;
2297
        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2298
        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2299
        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2300

    
2301
        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2302

    
2303
        if(type){
2304
            pred_mv(s, &mx, &my, 0, left, top, tr);
2305
            l += get_symbol(&s->c, &s->block_state[32], 1);
2306
            cb+= get_symbol(&s->c, &s->block_state[64], 1);
2307
            cr+= get_symbol(&s->c, &s->block_state[96], 1);
2308
        }else{
2309
            if(s->ref_frames > 1)
2310
                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2311
            pred_mv(s, &mx, &my, ref, left, top, tr);
2312
            mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2313
            my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2314
        }
2315
        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2316
    }else{
2317
        decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2318
        decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2319
        decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2320
        decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2321
    }
2322
}
2323

    
2324
static void encode_blocks(SnowContext *s, int search){
2325
    int x, y;
2326
    int w= s->b_width;
2327
    int h= s->b_height;
2328

    
2329
    if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2330
        iterative_me(s);
2331

    
2332
    for(y=0; y<h; y++){
2333
        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2334
            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2335
            return;
2336
        }
2337
        for(x=0; x<w; x++){
2338
            if(s->avctx->me_method == ME_ITER || !search)
2339
                encode_q_branch2(s, 0, x, y);
2340
            else
2341
                encode_q_branch (s, 0, x, y);
2342
        }
2343
    }
2344
}
2345

    
2346
static void decode_blocks(SnowContext *s){
2347
    int x, y;
2348
    int w= s->b_width;
2349
    int h= s->b_height;
2350

    
2351
    for(y=0; y<h; y++){
2352
        for(x=0; x<w; x++){
2353
            decode_q_branch(s, 0, x, y);
2354
        }
2355
    }
2356
}
2357

    
2358
static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2359
    int x, y;
2360
START_TIMER
2361
    for(y=0; y < b_h+5; y++){
2362
        for(x=0; x < b_w; x++){
2363
            int a0= src[x    ];
2364
            int a1= src[x + 1];
2365
            int a2= src[x + 2];
2366
            int a3= src[x + 3];
2367
            int a4= src[x + 4];
2368
            int a5= src[x + 5];
2369
//            int am= 9*(a1+a2) - (a0+a3);
2370
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2371
//            int am= 18*(a2+a3) - 2*(a1+a4);
2372
//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2373
//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2374

    
2375
//            if(b_w==16) am= 8*(a1+a2);
2376

    
2377
            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2378
            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2379

    
2380
            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2381
            if(am&(~255)) am= ~(am>>31);
2382

    
2383
            tmp[x] = am;
2384

    
2385
/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2386
            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2387
            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2388
            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2389
        }
2390
        tmp += stride;
2391
        src += stride;
2392
    }
2393
    tmp -= (b_h+5)*stride;
2394

    
2395
    for(y=0; y < b_h; y++){
2396
        for(x=0; x < b_w; x++){
2397
            int a0= tmp[x + 0*stride];
2398
            int a1= tmp[x + 1*stride];
2399
            int a2= tmp[x + 2*stride];
2400
            int a3= tmp[x + 3*stride];
2401
            int a4= tmp[x + 4*stride];
2402
            int a5= tmp[x + 5*stride];
2403
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2404
//            int am= 18*(a2+a3) - 2*(a1+a4);
2405
/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2406
            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2407

    
2408
//            if(b_w==16) am= 8*(a1+a2);
2409

    
2410
            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2411
            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2412

    
2413
            if(am&(~255)) am= ~(am>>31);
2414

    
2415
            dst[x] = am;
2416
/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2417
            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2418
            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2419
            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2420
        }
2421
        dst += stride;
2422
        tmp += stride;
2423
    }
2424
STOP_TIMER("mc_block")
2425
}
2426

    
2427
#define mca(dx,dy,b_w)\
2428
static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
2429
    uint8_t tmp[stride*(b_w+5)];\
2430
    assert(h==b_w);\
2431
    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2432
}
2433

    
2434
mca( 0, 0,16)
2435
mca( 8, 0,16)
2436
mca( 0, 8,16)
2437
mca( 8, 8,16)
2438
mca( 0, 0,8)
2439
mca( 8, 0,8)
2440
mca( 0, 8,8)
2441
mca( 8, 8,8)
2442

    
2443
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2444
    if(block->type & BLOCK_INTRA){
2445
        int x, y;
2446
        const int color = block->color[plane_index];
2447
        const int color4= color*0x01010101;
2448
        if(b_w==32){
2449
            for(y=0; y < b_h; y++){
2450
                *(uint32_t*)&dst[0 + y*stride]= color4;
2451
                *(uint32_t*)&dst[4 + y*stride]= color4;
2452
                *(uint32_t*)&dst[8 + y*stride]= color4;
2453
                *(uint32_t*)&dst[12+ y*stride]= color4;
2454
                *(uint32_t*)&dst[16+ y*stride]= color4;
2455
                *(uint32_t*)&dst[20+ y*stride]= color4;
2456
                *(uint32_t*)&dst[24+ y*stride]= color4;
2457
                *(uint32_t*)&dst[28+ y*stride]= color4;
2458
            }
2459
        }else if(b_w==16){
2460
            for(y=0; y < b_h; y++){
2461
                *(uint32_t*)&dst[0 + y*stride]= color4;
2462
                *(uint32_t*)&dst[4 + y*stride]= color4;
2463
                *(uint32_t*)&dst[8 + y*stride]= color4;
2464
                *(uint32_t*)&dst[12+ y*stride]= color4;
2465
            }
2466
        }else if(b_w==8){
2467
            for(y=0; y < b_h; y++){
2468
                *(uint32_t*)&dst[0 + y*stride]= color4;
2469
                *(uint32_t*)&dst[4 + y*stride]= color4;
2470
            }
2471
        }else if(b_w==4){
2472
            for(y=0; y < b_h; y++){
2473
                *(uint32_t*)&dst[0 + y*stride]= color4;
2474
            }
2475
        }else{
2476
            for(y=0; y < b_h; y++){
2477
                for(x=0; x < b_w; x++){
2478
                    dst[x + y*stride]= color;
2479
                }
2480
            }
2481
        }
2482
    }else{
2483
        uint8_t *src= s->last_picture[block->ref].data[plane_index];
2484
        const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2485
        int mx= block->mx*scale;
2486
        int my= block->my*scale;
2487
        const int dx= mx&15;
2488
        const int dy= my&15;
2489
        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2490
        sx += (mx>>4) - 2;
2491
        sy += (my>>4) - 2;
2492
        src += sx + sy*stride;
2493
        if(   (unsigned)sx >= w - b_w - 4
2494
           || (unsigned)sy >= h - b_h - 4){
2495
            ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2496
            src= tmp + MB_SIZE;
2497
        }
2498
//        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2499
//        assert(!(b_w&(b_w-1)));
2500
        assert(b_w>1 && b_h>1);
2501
        assert(tab_index>=0 && tab_index<4 || b_w==32);
2502
        if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2503
            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2504
        else if(b_w==32){
2505
            int y;
2506
            for(y=0; y<b_h; y+=16){
2507
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2508
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2509
            }
2510
        }else if(b_w==b_h)
2511
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2512
        else if(b_w==2*b_h){
2513
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2514
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2515
        }else{
2516
            assert(2*b_w==b_h);
2517
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2518
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2519
        }
2520
    }
2521
}
2522

    
2523
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2524
                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2525
    int y, x;
2526
    DWTELEM * dst;
2527
    for(y=0; y<b_h; y++){
2528
        //FIXME ugly missue of obmc_stride
2529
        const uint8_t *obmc1= obmc + y*obmc_stride;
2530
        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2531
        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2532
        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2533
        dst = slice_buffer_get_line(sb, src_y + y);
2534
        for(x=0; x<b_w; x++){
2535
            int v=   obmc1[x] * block[3][x + y*src_stride]
2536
                    +obmc2[x] * block[2][x + y*src_stride]
2537
                    +obmc3[x] * block[1][x + y*src_stride]
2538
                    +obmc4[x] * block[0][x + y*src_stride];
2539

    
2540
            v <<= 8 - LOG2_OBMC_MAX;
2541
            if(FRAC_BITS != 8){
2542
                v += 1<<(7 - FRAC_BITS);
2543
                v >>= 8 - FRAC_BITS;
2544
            }
2545
            if(add){
2546
                v += dst[x + src_x];
2547
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2548
                if(v&(~255)) v= ~(v>>31);
2549
                dst8[x + y*src_stride] = v;
2550
            }else{
2551
                dst[x + src_x] -= v;
2552
            }
2553
        }
2554
    }
2555
}
2556

    
2557
//FIXME name clenup (b_w, block_w, b_width stuff)
2558
static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2559
    const int b_width = s->b_width  << s->block_max_depth;
2560
    const int b_height= s->b_height << s->block_max_depth;
2561
    const int b_stride= b_width;
2562
    BlockNode *lt= &s->block[b_x + b_y*b_stride];
2563
    BlockNode *rt= lt+1;
2564
    BlockNode *lb= lt+b_stride;
2565
    BlockNode *rb= lb+1;
2566
    uint8_t *block[4];
2567
    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2568
    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2569
    uint8_t *ptmp;
2570
    int x,y;
2571

    
2572
    if(b_x<0){
2573
        lt= rt;
2574
        lb= rb;
2575
    }else if(b_x + 1 >= b_width){
2576
        rt= lt;
2577
        rb= lb;
2578
    }
2579
    if(b_y<0){
2580
        lt= lb;
2581
        rt= rb;
2582
    }else if(b_y + 1 >= b_height){
2583
        lb= lt;
2584
        rb= rt;
2585
    }
2586

    
2587
    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2588
        obmc -= src_x;
2589
        b_w += src_x;
2590
        if(!sliced && !offset_dst)
2591
            dst -= src_x;
2592
        src_x=0;
2593
    }else if(src_x + b_w > w){
2594
        b_w = w - src_x;
2595
    }
2596
    if(src_y<0){
2597
        obmc -= src_y*obmc_stride;
2598
        b_h += src_y;
2599
        if(!sliced && !offset_dst)
2600
            dst -= src_y*dst_stride;
2601
        src_y=0;
2602
    }else if(src_y + b_h> h){
2603
        b_h = h - src_y;
2604
    }
2605

    
2606
    if(b_w<=0 || b_h<=0) return;
2607

    
2608
assert(src_stride > 2*MB_SIZE + 5);
2609
    if(!sliced && offset_dst)
2610
        dst += src_x + src_y*dst_stride;
2611
    dst8+= src_x + src_y*src_stride;
2612
//    src += src_x + src_y*src_stride;
2613

    
2614
    ptmp= tmp + 3*tmp_step;
2615
    block[0]= ptmp;
2616
    ptmp+=tmp_step;
2617
    pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2618

    
2619
    if(same_block(lt, rt)){
2620
        block[1]= block[0];
2621
    }else{
2622
        block[1]= ptmp;
2623
        ptmp+=tmp_step;
2624
        pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2625
    }
2626

    
2627
    if(same_block(lt, lb)){
2628
        block[2]= block[0];
2629
    }else if(same_block(rt, lb)){
2630
        block[2]= block[1];
2631
    }else{
2632
        block[2]= ptmp;
2633
        ptmp+=tmp_step;
2634
        pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2635
    }
2636

    
2637
    if(same_block(lt, rb) ){
2638
        block[3]= block[0];
2639
    }else if(same_block(rt, rb)){
2640
        block[3]= block[1];
2641
    }else if(same_block(lb, rb)){
2642
        block[3]= block[2];
2643
    }else{
2644
        block[3]= ptmp;
2645
        pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2646
    }
2647
#if 0
2648
    for(y=0; y<b_h; y++){
2649
        for(x=0; x<b_w; x++){
2650
            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2651
            if(add) dst[x + y*dst_stride] += v;
2652
            else    dst[x + y*dst_stride] -= v;
2653
        }
2654
    }
2655
    for(y=0; y<b_h; y++){
2656
        uint8_t *obmc2= obmc + (obmc_stride>>1);
2657
        for(x=0; x<b_w; x++){
2658
            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2659
            if(add) dst[x + y*dst_stride] += v;
2660
            else    dst[x + y*dst_stride] -= v;
2661
        }
2662
    }
2663
    for(y=0; y<b_h; y++){
2664
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2665
        for(x=0; x<b_w; x++){
2666
            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2667
            if(add) dst[x + y*dst_stride] += v;
2668
            else    dst[x + y*dst_stride] -= v;
2669
        }
2670
    }
2671
    for(y=0; y<b_h; y++){
2672
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2673
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2674
        for(x=0; x<b_w; x++){
2675
            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2676
            if(add) dst[x + y*dst_stride] += v;
2677
            else    dst[x + y*dst_stride] -= v;
2678
        }
2679
    }
2680
#else
2681
    if(sliced){
2682
        START_TIMER
2683

    
2684
        s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2685
        STOP_TIMER("inner_add_yblock")
2686
    }else
2687
    for(y=0; y<b_h; y++){
2688
        //FIXME ugly missue of obmc_stride
2689
        const uint8_t *obmc1= obmc + y*obmc_stride;
2690
        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2691
        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2692
        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2693
        for(x=0; x<b_w; x++){
2694
            int v=   obmc1[x] * block[3][x + y*src_stride]
2695
                    +obmc2[x] * block[2][x + y*src_stride]
2696
                    +obmc3[x] * block[1][x + y*src_stride]
2697
                    +obmc4[x] * block[0][x + y*src_stride];
2698

    
2699
            v <<= 8 - LOG2_OBMC_MAX;
2700
            if(FRAC_BITS != 8){
2701
                v += 1<<(7 - FRAC_BITS);
2702
                v >>= 8 - FRAC_BITS;
2703
            }
2704
            if(add){
2705
                v += dst[x + y*dst_stride];
2706
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2707
                if(v&(~255)) v= ~(v>>31);
2708
                dst8[x + y*src_stride] = v;
2709
            }else{
2710
                dst[x + y*dst_stride] -= v;
2711
            }
2712
        }
2713
    }
2714
#endif
2715
}
2716

    
2717
static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2718
    Plane *p= &s->plane[plane_index];
2719
    const int mb_w= s->b_width  << s->block_max_depth;
2720
    const int mb_h= s->b_height << s->block_max_depth;
2721
    int x, y, mb_x;
2722
    int block_size = MB_SIZE >> s->block_max_depth;
2723
    int block_w    = plane_index ? block_size/2 : block_size;
2724
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2725
    int obmc_stride= plane_index ? block_size : 2*block_size;
2726
    int ref_stride= s->current_picture.linesize[plane_index];
2727
    uint8_t *dst8= s->current_picture.data[plane_index];
2728
    int w= p->width;
2729
    int h= p->height;
2730
    START_TIMER
2731

    
2732
    if(s->keyframe || (s->avctx->debug&512)){
2733
        if(mb_y==mb_h)
2734
            return;
2735

    
2736
        if(add){
2737
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2738
            {
2739
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2740
                DWTELEM * line = sb->line[y];
2741
                for(x=0; x<w; x++)
2742
                {
2743
//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2744
                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2745
                    v >>= FRAC_BITS;
2746
                    if(v&(~255)) v= ~(v>>31);
2747
                    dst8[x + y*ref_stride]= v;
2748
                }
2749
            }
2750
        }else{
2751
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2752
            {
2753
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2754
                DWTELEM * line = sb->line[y];
2755
                for(x=0; x<w; x++)
2756
                {
2757
                    line[x] -= 128 << FRAC_BITS;
2758
//                    buf[x + y*w]-= 128<<FRAC_BITS;
2759
                }
2760
            }
2761
        }
2762

    
2763
        return;
2764
    }
2765

    
2766
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2767
            START_TIMER
2768

    
2769
            add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2770
                       block_w*mb_x - block_w/2,
2771
                       block_w*mb_y - block_w/2,
2772
                       block_w, block_w,
2773
                       w, h,
2774
                       w, ref_stride, obmc_stride,
2775
                       mb_x - 1, mb_y - 1,
2776
                       add, 0, plane_index);
2777

    
2778
            STOP_TIMER("add_yblock")
2779
        }
2780

    
2781
    STOP_TIMER("predict_slice")
2782
}
2783

    
2784
static av_always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2785
    Plane *p= &s->plane[plane_index];
2786
    const int mb_w= s->b_width  << s->block_max_depth;
2787
    const int mb_h= s->b_height << s->block_max_depth;
2788
    int x, y, mb_x;
2789
    int block_size = MB_SIZE >> s->block_max_depth;
2790
    int block_w    = plane_index ? block_size/2 : block_size;
2791
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2792
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2793
    int ref_stride= s->current_picture.linesize[plane_index];
2794
    uint8_t *dst8= s->current_picture.data[plane_index];
2795
    int w= p->width;
2796
    int h= p->height;
2797
    START_TIMER
2798

    
2799
    if(s->keyframe || (s->avctx->debug&512)){
2800
        if(mb_y==mb_h)
2801
            return;
2802

    
2803
        if(add){
2804
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2805
                for(x=0; x<w; x++){
2806
                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2807
                    v >>= FRAC_BITS;
2808
                    if(v&(~255)) v= ~(v>>31);
2809
                    dst8[x + y*ref_stride]= v;
2810
                }
2811
            }
2812
        }else{
2813
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2814
                for(x=0; x<w; x++){
2815
                    buf[x + y*w]-= 128<<FRAC_BITS;
2816
                }
2817
            }
2818
        }
2819

    
2820
        return;
2821
    }
2822

    
2823
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2824
            START_TIMER
2825

    
2826
            add_yblock(s, 0, NULL, buf, dst8, obmc,
2827
                       block_w*mb_x - block_w/2,
2828
                       block_w*mb_y - block_w/2,
2829
                       block_w, block_w,
2830
                       w, h,
2831
                       w, ref_stride, obmc_stride,
2832
                       mb_x - 1, mb_y - 1,
2833
                       add, 1, plane_index);
2834

    
2835
            STOP_TIMER("add_yblock")
2836
        }
2837

    
2838
    STOP_TIMER("predict_slice")
2839
}
2840

    
2841
static av_always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2842
    const int mb_h= s->b_height << s->block_max_depth;
2843
    int mb_y;
2844
    for(mb_y=0; mb_y<=mb_h; mb_y++)
2845
        predict_slice(s, buf, plane_index, add, mb_y);
2846
}
2847

    
2848
static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2849
    int i, x2, y2;
2850
    Plane *p= &s->plane[plane_index];
2851
    const int block_size = MB_SIZE >> s->block_max_depth;
2852
    const int block_w    = plane_index ? block_size/2 : block_size;
2853
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2854
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2855
    const int ref_stride= s->current_picture.linesize[plane_index];
2856
    uint8_t *src= s-> input_picture.data[plane_index];
2857
    DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2858
    const int b_stride = s->b_width << s->block_max_depth;
2859
    const int w= p->width;
2860
    const int h= p->height;
2861
    int index= mb_x + mb_y*b_stride;
2862
    BlockNode *b= &s->block[index];
2863
    BlockNode backup= *b;
2864
    int ab=0;
2865
    int aa=0;
2866

    
2867
    b->type|= BLOCK_INTRA;
2868
    b->color[plane_index]= 0;
2869
    memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2870

    
2871
    for(i=0; i<4; i++){
2872
        int mb_x2= mb_x + (i &1) - 1;
2873
        int mb_y2= mb_y + (i>>1) - 1;
2874
        int x= block_w*mb_x2 + block_w/2;
2875
        int y= block_w*mb_y2 + block_w/2;
2876

    
2877
        add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2878
                    x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2879

    
2880
        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2881
            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2882
                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2883
                int obmc_v= obmc[index];
2884
                int d;
2885
                if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2886
                if(x<0) obmc_v += obmc[index + block_w];
2887
                if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2888
                if(x+block_w>w) obmc_v += obmc[index - block_w];
2889
                //FIXME precalc this or simplify it somehow else
2890

    
2891
                d = -dst[index] + (1<<(FRAC_BITS-1));
2892
                dst[index] = d;
2893
                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2894
                aa += obmc_v * obmc_v; //FIXME precalclate this
2895
            }
2896
        }
2897
    }
2898
    *b= backup;
2899

    
2900
    return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
2901
}
2902

    
2903
static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2904
    const int b_stride = s->b_width << s->block_max_depth;
2905
    const int b_height = s->b_height<< s->block_max_depth;
2906
    int index= x + y*b_stride;
2907
    const BlockNode *b     = &s->block[index];
2908
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2909
    const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2910
    const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2911
    const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2912
    int dmx, dmy;
2913
//  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2914
//  int my_context= av_log2(2*FFABS(left->my - top->my));
2915

    
2916
    if(x<0 || x>=b_stride || y>=b_height)
2917
        return 0;
2918
/*
2919
1            0      0
2920
01X          1-2    1
2921
001XX        3-6    2-3
2922
0001XXX      7-14   4-7
2923
00001XXXX   15-30   8-15
2924
*/
2925
//FIXME try accurate rate
2926
//FIXME intra and inter predictors if surrounding blocks arent the same type
2927
    if(b->type & BLOCK_INTRA){
2928
        return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2929
                   + av_log2(2*FFABS(left->color[1] - b->color[1]))
2930
                   + av_log2(2*FFABS(left->color[2] - b->color[2])));
2931
    }else{
2932
        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2933
        dmx-= b->mx;
2934
        dmy-= b->my;
2935
        return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2936
                    + av_log2(2*FFABS(dmy))
2937
                    + av_log2(2*b->ref));
2938
    }
2939
}
2940

    
2941
static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2942
    Plane *p= &s->plane[plane_index];
2943
    const int block_size = MB_SIZE >> s->block_max_depth;
2944
    const int block_w    = plane_index ? block_size/2 : block_size;
2945
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2946
    const int ref_stride= s->current_picture.linesize[plane_index];
2947
    uint8_t *dst= s->current_picture.data[plane_index];
2948
    uint8_t *src= s->  input_picture.data[plane_index];
2949
    DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2950
    uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2951
    uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
2952
    const int b_stride = s->b_width << s->block_max_depth;
2953
    const int b_height = s->b_height<< s->block_max_depth;
2954
    const int w= p->width;
2955
    const int h= p->height;
2956
    int distortion;
2957
    int rate= 0;
2958
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2959
    int sx= block_w*mb_x - block_w/2;
2960
    int sy= block_w*mb_y - block_w/2;
2961
    int x0= FFMAX(0,-sx);
2962
    int y0= FFMAX(0,-sy);
2963
    int x1= FFMIN(block_w*2, w-sx);
2964
    int y1= FFMIN(block_w*2, h-sy);
2965
    int i,x,y;
2966

    
2967
    pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2968

    
2969
    for(y=y0; y<y1; y++){
2970
        const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2971
        const DWTELEM *pred1 = pred + y*obmc_stride;
2972
        uint8_t *cur1 = cur + y*ref_stride;
2973
        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2974
        for(x=x0; x<x1; x++){
2975
            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2976
            v = (v + pred1[x]) >> FRAC_BITS;
2977
            if(v&(~255)) v= ~(v>>31);
2978
            dst1[x] = v;
2979
        }
2980
    }
2981

    
2982
    /* copy the regions where obmc[] = (uint8_t)256 */
2983
    if(LOG2_OBMC_MAX == 8
2984
        && (mb_x == 0 || mb_x == b_stride-1)
2985
        && (mb_y == 0 || mb_y == b_height-1)){
2986
        if(mb_x == 0)
2987
            x1 = block_w;
2988
        else
2989
            x0 = block_w;
2990
        if(mb_y == 0)
2991
            y1 = block_w;
2992
        else
2993
            y0 = block_w;
2994
        for(y=y0; y<y1; y++)
2995
            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2996
    }
2997

    
2998
    if(block_w==16){
2999
        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
3000
        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
3001
        /* FIXME cmps overlap but don't cover the wavelet's whole support,
3002
         * so improving the score of one block is not strictly guaranteed to
3003
         * improve the score of the whole frame, so iterative motion est
3004
         * doesn't always converge. */
3005
        if(s->avctx->me_cmp == FF_CMP_W97)
3006
            distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3007
        else if(s->avctx->me_cmp == FF_CMP_W53)
3008
            distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3009
        else{
3010
            distortion = 0;
3011
            for(i=0; i<4; i++){
3012
                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3013
                distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3014
            }
3015
        }
3016
    }else{
3017
        assert(block_w==8);
3018
        distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3019
    }
3020

    
3021
    if(plane_index==0){
3022
        for(i=0; i<4; i++){
3023
/* ..RRr
3024
 * .RXx.
3025
 * rxx..
3026
 */
3027
            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3028
        }
3029
        if(mb_x == b_stride-2)
3030
            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3031
    }
3032
    return distortion + rate*penalty_factor;
3033
}
3034

    
3035
static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3036
    int i, y2;
3037
    Plane *p= &s->plane[plane_index];
3038
    const int block_size = MB_SIZE >> s->block_max_depth;
3039
    const int block_w    = plane_index ? block_size/2 : block_size;
3040
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3041
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3042
    const int ref_stride= s->current_picture.linesize[plane_index];
3043
    uint8_t *dst= s->current_picture.data[plane_index];
3044
    uint8_t *src= s-> input_picture.data[plane_index];
3045
    static const DWTELEM zero_dst[4096]; //FIXME
3046
    const int b_stride = s->b_width << s->block_max_depth;
3047
    const int w= p->width;
3048
    const int h= p->height;
3049
    int distortion= 0;
3050
    int rate= 0;
3051
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3052

    
3053
    for(i=0; i<9; i++){
3054
        int mb_x2= mb_x + (i%3) - 1;
3055
        int mb_y2= mb_y + (i/3) - 1;
3056
        int x= block_w*mb_x2 + block_w/2;
3057
        int y= block_w*mb_y2 + block_w/2;
3058

    
3059
        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
3060
                   x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3061

    
3062
        //FIXME find a cleaner/simpler way to skip the outside stuff
3063
        for(y2= y; y2<0; y2++)
3064
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3065
        for(y2= h; y2<y+block_w; y2++)
3066
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3067
        if(x<0){
3068
            for(y2= y; y2<y+block_w; y2++)
3069
                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3070
        }
3071
        if(x+block_w > w){
3072
            for(y2= y; y2<y+block_w; y2++)
3073
                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3074
        }
3075

    
3076
        assert(block_w== 8 || block_w==16);
3077
        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3078
    }
3079

    
3080
    if(plane_index==0){
3081
        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3082
        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3083

    
3084
/* ..RRRr
3085
 * .RXXx.
3086
 * .RXXx.
3087
 * rxxx.
3088
 */
3089
        if(merged)
3090
            rate = get_block_bits(s, mb_x, mb_y, 2);
3091
        for(i=merged?4:0; i<9; i++){
3092
            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3093
            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3094
        }
3095
    }
3096
    return distortion + rate*penalty_factor;
3097
}
3098

    
3099
static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3100
    const int b_stride= s->b_width << s->block_max_depth;
3101
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3102
    BlockNode backup= *block;
3103
    int rd, index, value;
3104

    
3105
    assert(mb_x>=0 && mb_y>=0);
3106
    assert(mb_x<b_stride);
3107

    
3108
    if(intra){
3109
        block->color[0] = p[0];
3110
        block->color[1] = p[1];
3111
        block->color[2] = p[2];
3112
        block->type |= BLOCK_INTRA;
3113
    }else{
3114
        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3115
        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3116
        if(s->me_cache[index] == value)
3117
            return 0;
3118
        s->me_cache[index]= value;
3119

    
3120
        block->mx= p[0];
3121
        block->my= p[1];
3122
        block->type &= ~BLOCK_INTRA;
3123
    }
3124

    
3125
    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3126

    
3127
//FIXME chroma
3128
    if(rd < *best_rd){
3129
        *best_rd= rd;
3130
        return 1;
3131
    }else{
3132
        *block= backup;
3133
        return 0;
3134
    }
3135
}
3136

    
3137
/* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3138
static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3139
    int p[2] = {p0, p1};
3140
    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3141
}
3142

    
3143
static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3144
    const int b_stride= s->b_width << s->block_max_depth;
3145
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3146
    BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3147
    int rd, index, value;
3148

    
3149
    assert(mb_x>=0 && mb_y>=0);
3150
    assert(mb_x<b_stride);
3151
    assert(((mb_x|mb_y)&1) == 0);
3152

    
3153
    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3154
    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3155
    if(s->me_cache[index] == value)
3156
        return 0;
3157
    s->me_cache[index]= value;
3158

    
3159
    block->mx= p0;
3160
    block->my= p1;
3161
    block->ref= ref;
3162
    block->type &= ~BLOCK_INTRA;
3163
    block[1]= block[b_stride]= block[b_stride+1]= *block;
3164

    
3165
    rd= get_4block_rd(s, mb_x, mb_y, 0);
3166

    
3167
//FIXME chroma
3168
    if(rd < *best_rd){
3169
        *best_rd= rd;
3170
        return 1;
3171
    }else{
3172
        block[0]= backup[0];
3173
        block[1]= backup[1];
3174
        block[b_stride]= backup[2];
3175
        block[b_stride+1]= backup[3];
3176
        return 0;
3177
    }
3178
}
3179

    
3180
static void iterative_me(SnowContext *s){
3181
    int pass, mb_x, mb_y;
3182
    const int b_width = s->b_width  << s->block_max_depth;
3183
    const int b_height= s->b_height << s->block_max_depth;
3184
    const int b_stride= b_width;
3185
    int color[3];
3186

    
3187
    {
3188
        RangeCoder r = s->c;
3189
        uint8_t state[sizeof(s->block_state)];
3190
        memcpy(state, s->block_state, sizeof(s->block_state));
3191
        for(mb_y= 0; mb_y<s->b_height; mb_y++)
3192
            for(mb_x= 0; mb_x<s->b_width; mb_x++)
3193
                encode_q_branch(s, 0, mb_x, mb_y);
3194
        s->c = r;
3195
        memcpy(s->block_state, state, sizeof(s->block_state));
3196
    }
3197

    
3198
    for(pass=0; pass<25; pass++){
3199
        int change= 0;
3200

    
3201
        for(mb_y= 0; mb_y<b_height; mb_y++){
3202
            for(mb_x= 0; mb_x<b_width; mb_x++){
3203
                int dia_change, i, j, ref;
3204
                int best_rd= INT_MAX, ref_rd;
3205
                BlockNode backup, ref_b;
3206
                const int index= mb_x + mb_y * b_stride;
3207
                BlockNode *block= &s->block[index];
3208
                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3209
                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3210
                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3211
                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3212
                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3213
                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3214
                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3215
                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3216
                const int b_w= (MB_SIZE >> s->block_max_depth);
3217
                uint8_t obmc_edged[b_w*2][b_w*2];
3218

    
3219
                if(pass && (block->type & BLOCK_OPT))
3220
                    continue;
3221
                block->type |= BLOCK_OPT;
3222

    
3223
                backup= *block;
3224

    
3225
                if(!s->me_cache_generation)
3226
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3227
                s->me_cache_generation += 1<<22;
3228

    
3229
                //FIXME precalc
3230
                {
3231
                    int x, y;
3232
                    memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3233
                    if(mb_x==0)
3234
                        for(y=0; y<b_w*2; y++)
3235
                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3236
                    if(mb_x==b_stride-1)
3237
                        for(y=0; y<b_w*2; y++)
3238
                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3239
                    if(mb_y==0){
3240
                        for(x=0; x<b_w*2; x++)
3241
                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
3242
                        for(y=1; y<b_w; y++)
3243
                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3244
                    }
3245
                    if(mb_y==b_height-1){
3246
                        for(x=0; x<b_w*2; x++)
3247
                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3248
                        for(y=b_w; y<b_w*2-1; y++)
3249
                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3250
                    }
3251
                }
3252

    
3253
                //skip stuff outside the picture
3254
                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3255
                {
3256
                    uint8_t *src= s->  input_picture.data[0];
3257
                    uint8_t *dst= s->current_picture.data[0];
3258
                    const int stride= s->current_picture.linesize[0];
3259
                    const int block_w= MB_SIZE >> s->block_max_depth;
3260
                    const int sx= block_w*mb_x - block_w/2;
3261
                    const int sy= block_w*mb_y - block_w/2;
3262
                    const int w= s->plane[0].width;
3263
                    const int h= s->plane[0].height;
3264
                    int y;
3265

    
3266
                    for(y=sy; y<0; y++)
3267
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3268
                    for(y=h; y<sy+block_w*2; y++)
3269
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3270
                    if(sx<0){
3271
                        for(y=sy; y<sy+block_w*2; y++)
3272
                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3273
                    }
3274
                    if(sx+block_w*2 > w){
3275
                        for(y=sy; y<sy+block_w*2; y++)
3276
                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3277
                    }
3278
                }
3279

    
3280
                // intra(black) = neighbors' contribution to the current block
3281
                for(i=0; i<3; i++)
3282
                    color[i]= get_dc(s, mb_x, mb_y, i);
3283

    
3284
                // get previous score (cant be cached due to OBMC)
3285
                if(pass > 0 && (block->type&BLOCK_INTRA)){
3286
                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
3287
                    check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3288
                }else
3289
                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3290

    
3291
                ref_b= *block;
3292
                ref_rd= best_rd;
3293
                for(ref=0; ref < s->ref_frames; ref++){
3294
                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3295
                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3296
                        continue;
3297
                    block->ref= ref;
3298
                    best_rd= INT_MAX;
3299

    
3300
                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3301
                    check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3302
                    if(tb)
3303
                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3304
                    if(lb)
3305
                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3306
                    if(rb)
3307
                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3308
                    if(bb)
3309
                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3310

    
3311
                    /* fullpel ME */
3312
                    //FIXME avoid subpel interpol / round to nearest integer
3313
                    do{
3314
                        dia_change=0;
3315
                        for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3316
                            for(j=0; j<i; j++){
3317
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3318
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3319
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3320
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3321
                            }
3322
                        }
3323
                    }while(dia_change);
3324
                    /* subpel ME */
3325
                    do{
3326
                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3327
                        dia_change=0;
3328
                        for(i=0; i<8; i++)
3329
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3330
                    }while(dia_change);
3331
                    //FIXME or try the standard 2 pass qpel or similar
3332

    
3333
                    mvr[0][0]= block->mx;
3334
                    mvr[0][1]= block->my;
3335
                    if(ref_rd > best_rd){
3336
                        ref_rd= best_rd;
3337
                        ref_b= *block;
3338
                    }
3339
                }
3340
                best_rd= ref_rd;
3341
                *block= ref_b;
3342
#if 1
3343
                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3344
                //FIXME RD style color selection
3345
#endif
3346
                if(!same_block(block, &backup)){
3347
                    if(tb ) tb ->type &= ~BLOCK_OPT;
3348
                    if(lb ) lb ->type &= ~BLOCK_OPT;
3349
                    if(rb ) rb ->type &= ~BLOCK_OPT;
3350
                    if(bb ) bb ->type &= ~BLOCK_OPT;
3351
                    if(tlb) tlb->type &= ~BLOCK_OPT;
3352
                    if(trb) trb->type &= ~BLOCK_OPT;
3353
                    if(blb) blb->type &= ~BLOCK_OPT;
3354
                    if(brb) brb->type &= ~BLOCK_OPT;
3355
                    change ++;
3356
                }
3357
            }
3358
        }
3359
        av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3360
        if(!change)
3361
            break;
3362
    }
3363

    
3364
    if(s->block_max_depth == 1){
3365
        int change= 0;
3366
        for(mb_y= 0; mb_y<b_height; mb_y+=2){
3367
            for(mb_x= 0; mb_x<b_width; mb_x+=2){
3368
                int i;
3369
                int best_rd, init_rd;
3370
                const int index= mb_x + mb_y * b_stride;
3371
                BlockNode *b[4];
3372

    
3373
                b[0]= &s->block[index];
3374
                b[1]= b[0]+1;
3375
                b[2]= b[0]+b_stride;
3376
                b[3]= b[2]+1;
3377
                if(same_block(b[0], b[1]) &&
3378
                   same_block(b[0], b[2]) &&
3379
                   same_block(b[0], b[3]))
3380
                    continue;
3381

    
3382
                if(!s->me_cache_generation)
3383
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3384
                s->me_cache_generation += 1<<22;
3385

    
3386
                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3387

    
3388
                //FIXME more multiref search?
3389
                check_4block_inter(s, mb_x, mb_y,
3390
                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3391
                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3392

    
3393
                for(i=0; i<4; i++)
3394
                    if(!(b[i]->type&BLOCK_INTRA))
3395
                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3396

    
3397
                if(init_rd != best_rd)
3398
                    change++;
3399
            }
3400
        }
3401
        av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3402
    }
3403
}
3404

    
3405
static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3406
    const int level= b->level;
3407
    const int w= b->width;
3408
    const int h= b->height;
3409
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3410
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3411
    int x,y, thres1, thres2;
3412
//    START_TIMER
3413

    
3414
    if(s->qlog == LOSSLESS_QLOG) return;
3415

    
3416
    bias= bias ? 0 : (3*qmul)>>3;
3417
    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3418
    thres2= 2*thres1;
3419

    
3420
    if(!bias){
3421
        for(y=0; y<h; y++){
3422
            for(x=0; x<w; x++){
3423
                int i= src[x + y*stride];
3424

    
3425
                if((unsigned)(i+thres1) > thres2){
3426
                    if(i>=0){
3427
                        i<<= QEXPSHIFT;
3428
                        i/= qmul; //FIXME optimize
3429
                        src[x + y*stride]=  i;
3430
                    }else{
3431
                        i= -i;
3432
                        i<<= QEXPSHIFT;
3433
                        i/= qmul; //FIXME optimize
3434
                        src[x + y*stride]= -i;
3435
                    }
3436
                }else
3437
                    src[x + y*stride]= 0;
3438
            }
3439
        }
3440
    }else{
3441
        for(y=0; y<h; y++){
3442
            for(x=0; x<w; x++){
3443
                int i= src[x + y*stride];
3444

    
3445
                if((unsigned)(i+thres1) > thres2){
3446
                    if(i>=0){
3447
                        i<<= QEXPSHIFT;
3448
                        i= (i + bias) / qmul; //FIXME optimize
3449
                        src[x + y*stride]=  i;
3450
                    }else{
3451
                        i= -i;
3452
                        i<<= QEXPSHIFT;
3453
                        i= (i + bias) / qmul; //FIXME optimize
3454
                        src[x + y*stride]= -i;
3455
                    }
3456
                }else
3457
                    src[x + y*stride]= 0;
3458
            }
3459
        }
3460
    }
3461
    if(level+1 == s->spatial_decomposition_count){
3462
//        STOP_TIMER("quantize")
3463
    }
3464
}
3465

    
3466
static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3467
    const int w= b->width;
3468
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3469
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3470
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3471
    int x,y;
3472
    START_TIMER
3473

    
3474
    if(s->qlog == LOSSLESS_QLOG) return;
3475

    
3476
    for(y=start_y; y<end_y; y++){
3477
//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3478
        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3479
        for(x=0; x<w; x++){
3480
            int i= line[x];
3481
            if(i<0){
3482
                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3483
            }else if(i>0){
3484
                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3485
            }
3486
        }
3487
    }
3488
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3489
        STOP_TIMER("dquant")
3490
    }
3491
}
3492

    
3493
static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3494
    const int w= b->width;
3495
    const int h= b->height;
3496
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3497
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3498
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3499
    int x,y;
3500
    START_TIMER
3501

    
3502
    if(s->qlog == LOSSLESS_QLOG) return;
3503

    
3504
    for(y=0; y<h; y++){
3505
        for(x=0; x<w; x++){
3506
            int i= src[x + y*stride];
3507
            if(i<0){
3508
                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3509
            }else if(i>0){
3510
                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3511
            }
3512
        }
3513
    }
3514
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3515
        STOP_TIMER("dquant")
3516
    }
3517
}
3518

    
3519
static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3520
    const int w= b->width;
3521
    const int h= b->height;
3522
    int x,y;
3523

    
3524
    for(y=h-1; y>=0; y--){
3525
        for(x=w-1; x>=0; x--){
3526
            int i= x + y*stride;
3527

    
3528
            if(x){
3529
                if(use_median){
3530
                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3531
                    else  src[i] -= src[i - 1];
3532
                }else{
3533
                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3534
                    else  src[i] -= src[i - 1];
3535
                }
3536
            }else{
3537
                if(y) src[i] -= src[i - stride];
3538
            }
3539
        }
3540
    }
3541
}
3542

    
3543
static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3544
    const int w= b->width;
3545
    int x,y;
3546

    
3547
//    START_TIMER
3548

    
3549
    DWTELEM * line=0; // silence silly "could be used without having been initialized" warning
3550
    DWTELEM * prev;
3551

    
3552
    if (start_y != 0)
3553
        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3554

    
3555
    for(y=start_y; y<end_y; y++){
3556
        prev = line;
3557
//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3558
        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3559
        for(x=0; x<w; x++){
3560
            if(x){
3561
                if(use_median){
3562
                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3563
                    else  line[x] += line[x - 1];
3564
                }else{
3565
                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3566
                    else  line[x] += line[x - 1];
3567
                }
3568
            }else{
3569
                if(y) line[x] += prev[x];
3570
            }
3571
        }
3572
    }
3573

    
3574
//    STOP_TIMER("correlate")
3575
}
3576

    
3577
static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3578
    const int w= b->width;
3579
    const int h= b->height;
3580
    int x,y;
3581

    
3582
    for(y=0; y<h; y++){
3583
        for(x=0; x<w; x++){
3584
            int i= x + y*stride;
3585

    
3586
            if(x){
3587
                if(use_median){
3588
                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3589
                    else  src[i] += src[i - 1];
3590
                }else{
3591
                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3592
                    else  src[i] += src[i - 1];
3593
                }
3594
            }else{
3595
                if(y) src[i] += src[i - stride];
3596
            }
3597
        }
3598
    }
3599
}
3600

    
3601
static void encode_header(SnowContext *s){
3602
    int plane_index, level, orientation;
3603
    uint8_t kstate[32];
3604

    
3605
    memset(kstate, MID_STATE, sizeof(kstate));
3606

    
3607
    put_rac(&s->c, kstate, s->keyframe);
3608
    if(s->keyframe || s->always_reset){
3609
        reset_contexts(s);
3610
        s->last_spatial_decomposition_type=
3611
        s->last_qlog=
3612
        s->last_qbias=
3613
        s->last_mv_scale=
3614
        s->last_block_max_depth= 0;
3615
    }
3616
    if(s->keyframe){
3617
        put_symbol(&s->c, s->header_state, s->version, 0);
3618
        put_rac(&s->c, s->header_state, s->always_reset);
3619
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3620
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3621
        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3622
        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3623
        put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3624
        put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3625
        put_rac(&s->c, s->header_state, s->spatial_scalability);
3626
//        put_rac(&s->c, s->header_state, s->rate_scalability);
3627
        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3628

    
3629
        for(plane_index=0; plane_index<2; plane_index++){
3630
            for(level=0; level<s->spatial_decomposition_count; level++){
3631
                for(orientation=level ? 1:0; orientation<4; orientation++){
3632
                    if(orientation==2) continue;
3633
                    put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3634
                }
3635
            }
3636
        }
3637
    }
3638
    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3639
    put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3640
    put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3641
    put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3642
    put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3643

    
3644
    s->last_spatial_decomposition_type= s->spatial_decomposition_type;
3645
    s->last_qlog                      = s->qlog;
3646
    s->last_qbias                     = s->qbias;
3647
    s->last_mv_scale                  = s->mv_scale;
3648
    s->last_block_max_depth           = s->block_max_depth;
3649
}
3650

    
3651
static int decode_header(SnowContext *s){
3652
    int plane_index, level, orientation;
3653
    uint8_t kstate[32];
3654

    
3655
    memset(kstate, MID_STATE, sizeof(kstate));
3656

    
3657
    s->keyframe= get_rac(&s->c, kstate);
3658
    if(s->keyframe || s->always_reset){
3659
        reset_contexts(s);
3660
        s->spatial_decomposition_type=
3661
        s->qlog=
3662
        s->qbias=
3663
        s->mv_scale=
3664
        s->block_max_depth= 0;
3665
    }
3666
    if(s->keyframe){
3667
        s->version= get_symbol(&s->c, s->header_state, 0);
3668
        if(s->version>0){
3669
            av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3670
            return -1;
3671
        }
3672
        s->always_reset= get_rac(&s->c, s->header_state);
3673
        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3674
        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3675
        s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3676
        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3677
        s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3678
        s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3679
        s->spatial_scalability= get_rac(&s->c, s->header_state);
3680
//        s->rate_scalability= get_rac(&s->c, s->header_state);
3681
        s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3682

    
3683
        for(plane_index=0; plane_index<3; plane_index++){
3684
            for(level=0; level<s->spatial_decomposition_count; level++){
3685
                for(orientation=level ? 1:0; orientation<4; orientation++){
3686
                    int q;
3687
                    if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3688
                    else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3689
                    else                    q= get_symbol(&s->c, s->header_state, 1);
3690
                    s->plane[plane_index].band[level][orientation].qlog= q;
3691
                }
3692
            }
3693
        }
3694
    }
3695

    
3696
    s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3697
    if(s->spatial_decomposition_type > 2){
3698
        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3699
        return -1;
3700
    }
3701

    
3702
    s->qlog           += get_symbol(&s->c, s->header_state, 1);
3703
    s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3704
    s->qbias          += get_symbol(&s->c, s->header_state, 1);
3705
    s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3706
    if(s->block_max_depth > 1 || s->block_max_depth < 0){
3707
        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3708
        s->block_max_depth= 0;
3709
        return -1;
3710
    }
3711

    
3712
    return 0;
3713
}
3714

    
3715
static void init_qexp(void){
3716
    int i;
3717
    double v=128;
3718

    
3719
    for(i=0; i<QROOT; i++){
3720
        qexp[i]= lrintf(v);
3721
        v *= pow(2, 1.0 / QROOT);
3722
    }
3723
}
3724

    
3725
static int common_init(AVCodecContext *avctx){
3726
    SnowContext *s = avctx->priv_data;
3727
    int width, height;
3728
    int level, orientation, plane_index, dec;
3729
    int i, j;
3730

    
3731
    s->avctx= avctx;
3732

    
3733
    dsputil_init(&s->dsp, avctx);
3734

    
3735
#define mcf(dx,dy)\
3736
    s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3737
    s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3738
        s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3739
    s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3740
    s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3741
        s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3742

    
3743
    mcf( 0, 0)
3744
    mcf( 4, 0)
3745
    mcf( 8, 0)
3746
    mcf(12, 0)
3747
    mcf( 0, 4)
3748
    mcf( 4, 4)
3749
    mcf( 8, 4)
3750
    mcf(12, 4)
3751
    mcf( 0, 8)
3752
    mcf( 4, 8)
3753
    mcf( 8, 8)
3754
    mcf(12, 8)
3755
    mcf( 0,12)
3756
    mcf( 4,12)
3757
    mcf( 8,12)
3758
    mcf(12,12)
3759

    
3760
#define mcfh(dx,dy)\
3761
    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3762
    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3763
        mc_block_hpel ## dx ## dy ## 16;\
3764
    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3765
    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3766
        mc_block_hpel ## dx ## dy ## 8;
3767

    
3768
    mcfh(0, 0)
3769
    mcfh(8, 0)
3770
    mcfh(0, 8)
3771
    mcfh(8, 8)
3772

    
3773
    if(!qexp[0])
3774
        init_qexp();
3775

    
3776
    dec= s->spatial_decomposition_count= 5;
3777
    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3778

    
3779
    s->chroma_h_shift= 1; //FIXME XXX
3780
    s->chroma_v_shift= 1;
3781

    
3782
//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3783

    
3784
    width= s->avctx->width;
3785
    height= s->avctx->height;
3786

    
3787
    s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3788

    
3789
    s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3790
    s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3791

    
3792
    for(plane_index=0; plane_index<3; plane_index++){
3793
        int w= s->avctx->width;
3794
        int h= s->avctx->height;
3795

    
3796
        if(plane_index){
3797
            w>>= s->chroma_h_shift;
3798
            h>>= s->chroma_v_shift;
3799
        }
3800
        s->plane[plane_index].width = w;
3801
        s->plane[plane_index].height= h;
3802
//av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3803
        for(level=s->spatial_decomposition_count-1; level>=0; level--){
3804
            for(orientation=level ? 1 : 0; orientation<4; orientation++){
3805
                SubBand *b= &s->plane[plane_index].band[level][orientation];
3806

    
3807
                b->buf= s->spatial_dwt_buffer;
3808
                b->level= level;
3809
                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3810
                b->width = (w + !(orientation&1))>>1;
3811
                b->height= (h + !(orientation>1))>>1;
3812

    
3813
                b->stride_line = 1 << (s->spatial_decomposition_count - level);
3814
                b->buf_x_offset = 0;
3815
                b->buf_y_offset = 0;
3816

    
3817
                if(orientation&1){
3818
                    b->buf += (w+1)>>1;
3819
                    b->buf_x_offset = (w+1)>>1;
3820
                }
3821
                if(orientation>1){
3822
                    b->buf += b->stride>>1;
3823
                    b->buf_y_offset = b->stride_line >> 1;
3824
                }
3825

    
3826
                if(level)
3827
                    b->parent= &s->plane[plane_index].band[level-1][orientation];
3828
                b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3829
            }
3830
            w= (w+1)>>1;
3831
            h= (h+1)>>1;
3832
        }
3833
    }
3834

    
3835
    for(i=0; i<MAX_REF_FRAMES; i++)
3836
        for(j=0; j<MAX_REF_FRAMES; j++)
3837
            scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3838

    
3839
    reset_contexts(s);
3840
/*
3841
    width= s->width= avctx->width;
3842
    height= s->height= avctx->height;
3843

3844
    assert(width && height);
3845
*/
3846
    s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3847

    
3848
    return 0;
3849
}
3850

    
3851
static int qscale2qlog(int qscale){
3852
    return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3853
           + 61*QROOT/8; //<64 >60
3854
}
3855

    
3856
static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3857
{
3858
    /* estimate the frame's complexity as a sum of weighted dwt coefs.
3859
     * FIXME we know exact mv bits at this point,
3860
     * but ratecontrol isn't set up to include them. */
3861
    uint32_t coef_sum= 0;
3862
    int level, orientation, delta_qlog;
3863

    
3864
    for(level=0; level<s->spatial_decomposition_count; level++){
3865
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3866
            SubBand *b= &s->plane[0].band[level][orientation];
3867
            DWTELEM *buf= b->buf;
3868
            const int w= b->width;
3869
            const int h= b->height;
3870
            const int stride= b->stride;
3871
            const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3872
            const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3873
            const int qdiv= (1<<16)/qmul;
3874
            int x, y;
3875
            if(orientation==0)
3876
                decorrelate(s, b, buf, stride, 1, 0);
3877
            for(y=0; y<h; y++)
3878
                for(x=0; x<w; x++)
3879
                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3880
            if(orientation==0)
3881
                correlate(s, b, buf, stride, 1, 0);
3882
        }
3883
    }
3884

    
3885
    /* ugly, ratecontrol just takes a sqrt again */
3886
    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3887
    assert(coef_sum < INT_MAX);
3888

    
3889
    if(pict->pict_type == I_TYPE){
3890
        s->m.current_picture.mb_var_sum= coef_sum;
3891
        s->m.current_picture.mc_mb_var_sum= 0;
3892
    }else{
3893
        s->m.current_picture.mc_mb_var_sum= coef_sum;
3894
        s->m.current_picture.mb_var_sum= 0;
3895
    }
3896

    
3897
    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3898
    if (pict->quality < 0)
3899
        return INT_MIN;
3900
    s->lambda= pict->quality * 3/2;
3901
    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3902
    s->qlog+= delta_qlog;
3903
    return delta_qlog;
3904
}
3905

    
3906
static void calculate_vissual_weight(SnowContext *s, Plane *p){
3907
    int width = p->width;
3908
    int height= p->height;
3909
    int level, orientation, x, y;
3910

    
3911
    for(level=0; level<s->spatial_decomposition_count; level++){
3912
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3913
            SubBand *b= &p->band[level][orientation];
3914
            DWTELEM *buf= b->buf;
3915
            int64_t error=0;
3916

    
3917
            memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3918
            buf[b->width/2 + b->height/2*b->stride]= 256*256;
3919
            ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3920
            for(y=0; y<height; y++){
3921
                for(x=0; x<width; x++){
3922
                    int64_t d= s->spatial_dwt_buffer[x + y*width];
3923
                    error += d*d;
3924
                }
3925
            }
3926

    
3927
            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3928
//            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3929
        }
3930
    }
3931
}
3932

    
3933
static int encode_init(AVCodecContext *avctx)
3934
{
3935
    SnowContext *s = avctx->priv_data;
3936
    int plane_index;
3937

    
3938
    if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3939
        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3940
               "use vstrict=-2 / -strict -2 to use it anyway\n");
3941
        return -1;
3942
    }
3943

    
3944
    if(avctx->prediction_method == DWT_97
3945
       && (avctx->flags & CODEC_FLAG_QSCALE)
3946
       && avctx->global_quality == 0){
3947
        av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
3948
        return -1;
3949
    }
3950

    
3951
    common_init(avctx);
3952
    alloc_blocks(s);
3953

    
3954
    s->version=0;
3955

    
3956
    s->m.avctx   = avctx;
3957
    s->m.flags   = avctx->flags;
3958
    s->m.bit_rate= avctx->bit_rate;
3959

    
3960
    s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3961
    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3962
    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3963
    s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3964
    h263_encode_init(&s->m); //mv_penalty
3965

    
3966
    s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
3967

    
3968
    if(avctx->flags&CODEC_FLAG_PASS1){
3969
        if(!avctx->stats_out)
3970
            avctx->stats_out = av_mallocz(256);
3971
    }
3972
    if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
3973
        if(ff_rate_control_init(&s->m) < 0)
3974
            return -1;
3975
    }
3976
    s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
3977

    
3978
    for(plane_index=0; plane_index<3; plane_index++){
3979
        calculate_vissual_weight(s, &s->plane[plane_index]);
3980
    }
3981

    
3982

    
3983
    avctx->coded_frame= &s->current_picture;
3984
    switch(avctx->pix_fmt){
3985
//    case PIX_FMT_YUV444P:
3986
//    case PIX_FMT_YUV422P:
3987
    case PIX_FMT_YUV420P:
3988
    case PIX_FMT_GRAY8:
3989
//    case PIX_FMT_YUV411P:
3990
//    case PIX_FMT_YUV410P:
3991
        s->colorspace_type= 0;
3992
        break;
3993
/*    case PIX_FMT_RGB32:
3994
        s->colorspace= 1;
3995
        break;*/
3996
    default:
3997
        av_log(avctx, AV_LOG_ERROR, "format not supported\n");
3998
        return -1;
3999
    }
4000
//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
4001
    s->chroma_h_shift= 1;
4002
    s->chroma_v_shift= 1;
4003

    
4004
    ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4005
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4006

    
4007
    s->avctx->get_buffer(s->avctx, &s->input_picture);
4008

    
4009
    if(s->avctx->me_method == ME_ITER){
4010
        int i;
4011
        int size= s->b_width * s->b_height << 2*s->block_max_depth;
4012
        for(i=0; i<s->max_ref_frames; i++){
4013
            s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4014
            s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4015
        }
4016
    }
4017

    
4018
    return 0;
4019
}
4020

    
4021
static int frame_start(SnowContext *s){
4022
   AVFrame tmp;
4023
   int w= s->avctx->width; //FIXME round up to x16 ?
4024
   int h= s->avctx->height;
4025

    
4026
    if(s->current_picture.data[0]){
4027
        draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4028
        draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4029
        draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4030
    }
4031

    
4032
    tmp= s->last_picture[s->max_ref_frames-1];
4033
    memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4034
    s->last_picture[0]= s->current_picture;
4035
    s->current_picture= tmp;
4036

    
4037
    if(s->keyframe){
4038
        s->ref_frames= 0;
4039
    }else{
4040
        int i;
4041
        for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4042
            if(i && s->last_p