Statistics
| Branch: | Revision:

ffmpeg / libavcodec / snow.c @ f66e4f5f

History | View | Annotate | Download (168 KB)

1
/*
2
 * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "avcodec.h"
22
#include "common.h"
23
#include "dsputil.h"
24
#include "snow.h"
25

    
26
#include "rangecoder.h"
27

    
28
#include "mpegvideo.h"
29

    
30
#undef NDEBUG
31
#include <assert.h>
32

    
33
static const int8_t quant3[256]={
34
 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
35
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
36
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
43
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
44
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
45
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
46
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
47
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
48
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
49
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
50
};
51
static const int8_t quant3b[256]={
52
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
57
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
59
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
60
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
61
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
62
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
63
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
64
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
65
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
66
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
67
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
68
};
69
static const int8_t quant3bA[256]={
70
 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
71
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
72
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
73
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
74
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
75
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
76
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
77
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
78
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
79
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
80
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
81
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
82
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
83
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
84
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
85
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
86
};
87
static const int8_t quant5[256]={
88
 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
91
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
92
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
93
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
94
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
95
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
96
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
97
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
98
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
99
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
100
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
101
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
102
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
103
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
104
};
105
static const int8_t quant7[256]={
106
 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
107
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
108
 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
109
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
110
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
111
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
112
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
113
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
114
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
115
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
116
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
117
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
118
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
119
-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
120
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
121
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
122
};
123
static const int8_t quant9[256]={
124
 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
125
 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
126
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
127
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
128
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
129
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
130
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
131
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
132
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
133
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
134
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
135
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
136
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
137
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
138
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
139
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
140
};
141
static const int8_t quant11[256]={
142
 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
143
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
144
 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
145
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
146
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
147
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
148
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
149
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
150
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
151
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
152
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
153
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
154
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
155
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
156
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
157
-4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
158
};
159
static const int8_t quant13[256]={
160
 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
161
 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
162
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
163
 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
164
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
165
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
166
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
167
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
168
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
169
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
170
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
171
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
172
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
173
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
174
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
175
-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
176
};
177

    
178
#if 0 //64*cubic
179
static const uint8_t obmc32[1024]={
180
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
182
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
183
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
184
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
185
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
186
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
187
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
188
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
189
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
190
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
191
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
192
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
193
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
194
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
195
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
196
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
197
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
198
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
199
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
200
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
201
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
202
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
203
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
204
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
205
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
206
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
207
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
208
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
209
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
210
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
211
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212
//error:0.000022
213
};
214
static const uint8_t obmc16[256]={
215
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
216
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
217
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
218
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
219
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
220
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
221
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
222
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
223
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
224
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
225
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
226
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
227
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
228
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
229
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
230
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
231
//error:0.000033
232
};
233
#elif 1 // 64*linear
234
static const uint8_t obmc32[1024]={
235
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
236
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
237
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
238
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
239
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
240
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
241
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
242
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
243
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
244
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
245
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
246
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
247
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
248
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
249
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
250
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
251
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
252
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
253
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
254
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
255
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
256
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
257
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
258
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
259
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
260
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
261
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
262
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
263
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
264
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
265
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
266
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
267
 //error:0.000020
268
};
269
static const uint8_t obmc16[256]={
270
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
271
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
272
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
273
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
274
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
275
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
276
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
277
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
278
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
279
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
280
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
281
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
282
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
283
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
284
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
285
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
286
//error:0.000015
287
};
288
#else //64*cos
289
static const uint8_t obmc32[1024]={
290
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
291
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
292
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
293
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
294
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
295
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
296
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
297
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
298
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
299
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
300
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
301
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
302
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
303
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
304
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
305
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
306
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
307
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
308
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
309
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
310
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
311
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
312
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
313
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
314
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
315
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
316
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
317
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
318
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
319
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
320
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
321
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
322
//error:0.000022
323
};
324
static const uint8_t obmc16[256]={
325
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
326
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
327
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
328
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
329
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
330
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
331
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
332
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
333
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
334
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
335
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
336
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
337
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
338
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
339
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
340
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
341
//error:0.000022
342
};
343
#endif
344

    
345
//linear *64
346
static const uint8_t obmc8[64]={
347
  4, 12, 20, 28, 28, 20, 12,  4,
348
 12, 36, 60, 84, 84, 60, 36, 12,
349
 20, 60,100,140,140,100, 60, 20,
350
 28, 84,140,196,196,140, 84, 28,
351
 28, 84,140,196,196,140, 84, 28,
352
 20, 60,100,140,140,100, 60, 20,
353
 12, 36, 60, 84, 84, 60, 36, 12,
354
  4, 12, 20, 28, 28, 20, 12,  4,
355
//error:0.000000
356
};
357

    
358
//linear *64
359
static const uint8_t obmc4[16]={
360
 16, 48, 48, 16,
361
 48,144,144, 48,
362
 48,144,144, 48,
363
 16, 48, 48, 16,
364
//error:0.000000
365
};
366

    
367
static const uint8_t *obmc_tab[4]={
368
    obmc32, obmc16, obmc8, obmc4
369
};
370

    
371
static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
372

    
373
typedef struct BlockNode{
374
    int16_t mx;
375
    int16_t my;
376
    uint8_t ref;
377
    uint8_t color[3];
378
    uint8_t type;
379
//#define TYPE_SPLIT    1
380
#define BLOCK_INTRA   1
381
#define BLOCK_OPT     2
382
//#define TYPE_NOCOLOR  4
383
    uint8_t level; //FIXME merge into type?
384
}BlockNode;
385

    
386
static const BlockNode null_block= { //FIXME add border maybe
387
    .color= {128,128,128},
388
    .mx= 0,
389
    .my= 0,
390
    .ref= 0,
391
    .type= 0,
392
    .level= 0,
393
};
394

    
395
#define LOG2_MB_SIZE 4
396
#define MB_SIZE (1<<LOG2_MB_SIZE)
397

    
398
typedef struct x_and_coeff{
399
    int16_t x;
400
    uint16_t coeff;
401
} x_and_coeff;
402

    
403
typedef struct SubBand{
404
    int level;
405
    int stride;
406
    int width;
407
    int height;
408
    int qlog;                                   ///< log(qscale)/log[2^(1/6)]
409
    DWTELEM *buf;
410
    int buf_x_offset;
411
    int buf_y_offset;
412
    int stride_line; ///< Stride measured in lines, not pixels.
413
    x_and_coeff * x_coeff;
414
    struct SubBand *parent;
415
    uint8_t state[/*7*2*/ 7 + 512][32];
416
}SubBand;
417

    
418
typedef struct Plane{
419
    int width;
420
    int height;
421
    SubBand band[MAX_DECOMPOSITIONS][4];
422
}Plane;
423

    
424
typedef struct SnowContext{
425
//    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
426

    
427
    AVCodecContext *avctx;
428
    RangeCoder c;
429
    DSPContext dsp;
430
    AVFrame new_picture;
431
    AVFrame input_picture;              ///< new_picture with the internal linesizes
432
    AVFrame current_picture;
433
    AVFrame last_picture[MAX_REF_FRAMES];
434
    AVFrame mconly_picture;
435
//     uint8_t q_context[16];
436
    uint8_t header_state[32];
437
    uint8_t block_state[128 + 32*128];
438
    int keyframe;
439
    int always_reset;
440
    int version;
441
    int spatial_decomposition_type;
442
    int last_spatial_decomposition_type;
443
    int temporal_decomposition_type;
444
    int spatial_decomposition_count;
445
    int temporal_decomposition_count;
446
    int max_ref_frames;
447
    int ref_frames;
448
    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
449
    uint32_t *ref_scores[MAX_REF_FRAMES];
450
    DWTELEM *spatial_dwt_buffer;
451
    int colorspace_type;
452
    int chroma_h_shift;
453
    int chroma_v_shift;
454
    int spatial_scalability;
455
    int qlog;
456
    int last_qlog;
457
    int lambda;
458
    int lambda2;
459
    int pass1_rc;
460
    int mv_scale;
461
    int last_mv_scale;
462
    int qbias;
463
    int last_qbias;
464
#define QBIAS_SHIFT 3
465
    int b_width;
466
    int b_height;
467
    int block_max_depth;
468
    int last_block_max_depth;
469
    Plane plane[MAX_PLANES];
470
    BlockNode *block;
471
#define ME_CACHE_SIZE 1024
472
    int me_cache[ME_CACHE_SIZE];
473
    int me_cache_generation;
474
    slice_buffer sb;
475

    
476
    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
477
}SnowContext;
478

    
479
typedef struct {
480
    DWTELEM *b0;
481
    DWTELEM *b1;
482
    DWTELEM *b2;
483
    DWTELEM *b3;
484
    int y;
485
} dwt_compose_t;
486

    
487
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
488
//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
489

    
490
static void iterative_me(SnowContext *s);
491

    
492
static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
493
{
494
    int i;
495

    
496
    buf->base_buffer = base_buffer;
497
    buf->line_count = line_count;
498
    buf->line_width = line_width;
499
    buf->data_count = max_allocated_lines;
500
    buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
501
    buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
502

    
503
    for (i = 0; i < max_allocated_lines; i++)
504
    {
505
      buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
506
    }
507

    
508
    buf->data_stack_top = max_allocated_lines - 1;
509
}
510

    
511
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
512
{
513
    int offset;
514
    DWTELEM * buffer;
515

    
516
//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
517

    
518
    assert(buf->data_stack_top >= 0);
519
//  assert(!buf->line[line]);
520
    if (buf->line[line])
521
        return buf->line[line];
522

    
523
    offset = buf->line_width * line;
524
    buffer = buf->data_stack[buf->data_stack_top];
525
    buf->data_stack_top--;
526
    buf->line[line] = buffer;
527

    
528
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
529

    
530
    return buffer;
531
}
532

    
533
static void slice_buffer_release(slice_buffer * buf, int line)
534
{
535
    int offset;
536
    DWTELEM * buffer;
537

    
538
    assert(line >= 0 && line < buf->line_count);
539
    assert(buf->line[line]);
540

    
541
    offset = buf->line_width * line;
542
    buffer = buf->line[line];
543
    buf->data_stack_top++;
544
    buf->data_stack[buf->data_stack_top] = buffer;
545
    buf->line[line] = NULL;
546

    
547
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
548
}
549

    
550
static void slice_buffer_flush(slice_buffer * buf)
551
{
552
    int i;
553
    for (i = 0; i < buf->line_count; i++)
554
    {
555
        if (buf->line[i])
556
        {
557
//      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
558
            slice_buffer_release(buf, i);
559
        }
560
    }
561
}
562

    
563
static void slice_buffer_destroy(slice_buffer * buf)
564
{
565
    int i;
566
    slice_buffer_flush(buf);
567

    
568
    for (i = buf->data_count - 1; i >= 0; i--)
569
    {
570
        assert(buf->data_stack[i]);
571
        av_freep(&buf->data_stack[i]);
572
    }
573
    assert(buf->data_stack);
574
    av_freep(&buf->data_stack);
575
    assert(buf->line);
576
    av_freep(&buf->line);
577
}
578

    
579
#ifdef __sgi
580
// Avoid a name clash on SGI IRIX
581
#undef qexp
582
#endif
583
#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
584
static uint8_t qexp[QROOT];
585

    
586
static inline int mirror(int v, int m){
587
    while((unsigned)v > (unsigned)m){
588
        v=-v;
589
        if(v<0) v+= 2*m;
590
    }
591
    return v;
592
}
593

    
594
static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
595
    int i;
596

    
597
    if(v){
598
        const int a= FFABS(v);
599
        const int e= av_log2(a);
600
#if 1
601
        const int el= FFMIN(e, 10);
602
        put_rac(c, state+0, 0);
603

    
604
        for(i=0; i<el; i++){
605
            put_rac(c, state+1+i, 1);  //1..10
606
        }
607
        for(; i<e; i++){
608
            put_rac(c, state+1+9, 1);  //1..10
609
        }
610
        put_rac(c, state+1+FFMIN(i,9), 0);
611

    
612
        for(i=e-1; i>=el; i--){
613
            put_rac(c, state+22+9, (a>>i)&1); //22..31
614
        }
615
        for(; i>=0; i--){
616
            put_rac(c, state+22+i, (a>>i)&1); //22..31
617
        }
618

    
619
        if(is_signed)
620
            put_rac(c, state+11 + el, v < 0); //11..21
621
#else
622

    
623
        put_rac(c, state+0, 0);
624
        if(e<=9){
625
            for(i=0; i<e; i++){
626
                put_rac(c, state+1+i, 1);  //1..10
627
            }
628
            put_rac(c, state+1+i, 0);
629

    
630
            for(i=e-1; i>=0; i--){
631
                put_rac(c, state+22+i, (a>>i)&1); //22..31
632
            }
633

    
634
            if(is_signed)
635
                put_rac(c, state+11 + e, v < 0); //11..21
636
        }else{
637
            for(i=0; i<e; i++){
638
                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
639
            }
640
            put_rac(c, state+1+FFMIN(i,9), 0);
641

    
642
            for(i=e-1; i>=0; i--){
643
                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
644
            }
645

    
646
            if(is_signed)
647
                put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
648
        }
649
#endif
650
    }else{
651
        put_rac(c, state+0, 1);
652
    }
653
}
654

    
655
static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
656
    if(get_rac(c, state+0))
657
        return 0;
658
    else{
659
        int i, e, a;
660
        e= 0;
661
        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
662
            e++;
663
        }
664

    
665
        a= 1;
666
        for(i=e-1; i>=0; i--){
667
            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
668
        }
669

    
670
        if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
671
            return -a;
672
        else
673
            return a;
674
    }
675
}
676

    
677
static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
678
    int i;
679
    int r= log2>=0 ? 1<<log2 : 1;
680

    
681
    assert(v>=0);
682
    assert(log2>=-4);
683

    
684
    while(v >= r){
685
        put_rac(c, state+4+log2, 1);
686
        v -= r;
687
        log2++;
688
        if(log2>0) r+=r;
689
    }
690
    put_rac(c, state+4+log2, 0);
691

    
692
    for(i=log2-1; i>=0; i--){
693
        put_rac(c, state+31-i, (v>>i)&1);
694
    }
695
}
696

    
697
static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
698
    int i;
699
    int r= log2>=0 ? 1<<log2 : 1;
700
    int v=0;
701

    
702
    assert(log2>=-4);
703

    
704
    while(get_rac(c, state+4+log2)){
705
        v+= r;
706
        log2++;
707
        if(log2>0) r+=r;
708
    }
709

    
710
    for(i=log2-1; i>=0; i--){
711
        v+= get_rac(c, state+31-i)<<i;
712
    }
713

    
714
    return v;
715
}
716

    
717
static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
718
    const int mirror_left= !highpass;
719
    const int mirror_right= (width&1) ^ highpass;
720
    const int w= (width>>1) - 1 + (highpass & width);
721
    int i;
722

    
723
#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
724
    if(mirror_left){
725
        dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
726
        dst += dst_step;
727
        src += src_step;
728
    }
729

    
730
    for(i=0; i<w; i++){
731
        dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
732
    }
733

    
734
    if(mirror_right){
735
        dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
736
    }
737
}
738

    
739
#ifndef lift5
740
static av_always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
741
    const int mirror_left= !highpass;
742
    const int mirror_right= (width&1) ^ highpass;
743
    const int w= (width>>1) - 1 + (highpass & width);
744
    int i;
745

    
746
    if(mirror_left){
747
        int r= 3*2*ref[0];
748
        r += r>>4;
749
        r += r>>8;
750
        dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
751
        dst += dst_step;
752
        src += src_step;
753
    }
754

    
755
    for(i=0; i<w; i++){
756
        int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
757
        r += r>>4;
758
        r += r>>8;
759
        dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
760
    }
761

    
762
    if(mirror_right){
763
        int r= 3*2*ref[w*ref_step];
764
        r += r>>4;
765
        r += r>>8;
766
        dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
767
    }
768
}
769
#endif
770

    
771
#ifndef liftS
772
static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
773
    const int mirror_left= !highpass;
774
    const int mirror_right= (width&1) ^ highpass;
775
    const int w= (width>>1) - 1 + (highpass & width);
776
    int i;
777

    
778
    assert(shift == 4);
779
#define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
780
    if(mirror_left){
781
        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
782
        dst += dst_step;
783
        src += src_step;
784
    }
785

    
786
    for(i=0; i<w; i++){
787
        dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
788
    }
789

    
790
    if(mirror_right){
791
        dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
792
    }
793
}
794
#endif
795

    
796

    
797
static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
798
    int x, i;
799

    
800
    for(x=start; x<width; x+=2){
801
        int64_t sum=0;
802

    
803
        for(i=0; i<n; i++){
804
            int x2= x + 2*i - n + 1;
805
            if     (x2<     0) x2= -x2;
806
            else if(x2>=width) x2= 2*width-x2-2;
807
            sum += coeffs[i]*(int64_t)dst[x2];
808
        }
809
        if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
810
        else        dst[x] += (sum + (1<<shift)/2)>>shift;
811
    }
812
}
813

    
814
static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
815
    int x, y, i;
816
    for(y=start; y<height; y+=2){
817
        for(x=0; x<width; x++){
818
            int64_t sum=0;
819

    
820
            for(i=0; i<n; i++){
821
                int y2= y + 2*i - n + 1;
822
                if     (y2<      0) y2= -y2;
823
                else if(y2>=height) y2= 2*height-y2-2;
824
                sum += coeffs[i]*(int64_t)dst[x + y2*stride];
825
            }
826
            if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
827
            else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
828
        }
829
    }
830
}
831

    
832
#define SCALEX 1
833
#define LX0 0
834
#define LX1 1
835

    
836
#if 0 // more accurate 9/7
837
#define N1 2
838
#define SHIFT1 14
839
#define COEFFS1 (int[]){-25987,-25987}
840
#define N2 2
841
#define SHIFT2 19
842
#define COEFFS2 (int[]){-27777,-27777}
843
#define N3 2
844
#define SHIFT3 15
845
#define COEFFS3 (int[]){28931,28931}
846
#define N4 2
847
#define SHIFT4 15
848
#define COEFFS4 (int[]){14533,14533}
849
#elif 1 // 13/7 CRF
850
#define N1 4
851
#define SHIFT1 4
852
#define COEFFS1 (int[]){1,-9,-9,1}
853
#define N2 4
854
#define SHIFT2 4
855
#define COEFFS2 (int[]){-1,5,5,-1}
856
#define N3 0
857
#define SHIFT3 1
858
#define COEFFS3 NULL
859
#define N4 0
860
#define SHIFT4 1
861
#define COEFFS4 NULL
862
#elif 1 // 3/5
863
#define LX0 1
864
#define LX1 0
865
#define SCALEX 0.5
866
#define N1 2
867
#define SHIFT1 1
868
#define COEFFS1 (int[]){1,1}
869
#define N2 2
870
#define SHIFT2 2
871
#define COEFFS2 (int[]){-1,-1}
872
#define N3 0
873
#define SHIFT3 0
874
#define COEFFS3 NULL
875
#define N4 0
876
#define SHIFT4 0
877
#define COEFFS4 NULL
878
#elif 1 // 11/5
879
#define N1 0
880
#define SHIFT1 1
881
#define COEFFS1 NULL
882
#define N2 2
883
#define SHIFT2 2
884
#define COEFFS2 (int[]){-1,-1}
885
#define N3 2
886
#define SHIFT3 0
887
#define COEFFS3 (int[]){-1,-1}
888
#define N4 4
889
#define SHIFT4 7
890
#define COEFFS4 (int[]){-5,29,29,-5}
891
#define SCALEX 4
892
#elif 1 // 9/7 CDF
893
#define N1 2
894
#define SHIFT1 7
895
#define COEFFS1 (int[]){-203,-203}
896
#define N2 2
897
#define SHIFT2 12
898
#define COEFFS2 (int[]){-217,-217}
899
#define N3 2
900
#define SHIFT3 7
901
#define COEFFS3 (int[]){113,113}
902
#define N4 2
903
#define SHIFT4 9
904
#define COEFFS4 (int[]){227,227}
905
#define SCALEX 1
906
#elif 1 // 7/5 CDF
907
#define N1 0
908
#define SHIFT1 1
909
#define COEFFS1 NULL
910
#define N2 2
911
#define SHIFT2 2
912
#define COEFFS2 (int[]){-1,-1}
913
#define N3 2
914
#define SHIFT3 0
915
#define COEFFS3 (int[]){-1,-1}
916
#define N4 2
917
#define SHIFT4 4
918
#define COEFFS4 (int[]){3,3}
919
#elif 1 // 9/7 MN
920
#define N1 4
921
#define SHIFT1 4
922
#define COEFFS1 (int[]){1,-9,-9,1}
923
#define N2 2
924
#define SHIFT2 2
925
#define COEFFS2 (int[]){1,1}
926
#define N3 0
927
#define SHIFT3 1
928
#define COEFFS3 NULL
929
#define N4 0
930
#define SHIFT4 1
931
#define COEFFS4 NULL
932
#else // 13/7 CRF
933
#define N1 4
934
#define SHIFT1 4
935
#define COEFFS1 (int[]){1,-9,-9,1}
936
#define N2 4
937
#define SHIFT2 4
938
#define COEFFS2 (int[]){-1,5,5,-1}
939
#define N3 0
940
#define SHIFT3 1
941
#define COEFFS3 NULL
942
#define N4 0
943
#define SHIFT4 1
944
#define COEFFS4 NULL
945
#endif
946
static void horizontal_decomposeX(DWTELEM *b, int width){
947
    DWTELEM temp[width];
948
    const int width2= width>>1;
949
    const int w2= (width+1)>>1;
950
    int x;
951

    
952
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
953
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
954
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
955
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
956

    
957
    for(x=0; x<width2; x++){
958
        temp[x   ]= b[2*x    ];
959
        temp[x+w2]= b[2*x + 1];
960
    }
961
    if(width&1)
962
        temp[x   ]= b[2*x    ];
963
    memcpy(b, temp, width*sizeof(int));
964
}
965

    
966
static void horizontal_composeX(DWTELEM *b, int width){
967
    DWTELEM temp[width];
968
    const int width2= width>>1;
969
    int x;
970
    const int w2= (width+1)>>1;
971

    
972
    memcpy(temp, b, width*sizeof(int));
973
    for(x=0; x<width2; x++){
974
        b[2*x    ]= temp[x   ];
975
        b[2*x + 1]= temp[x+w2];
976
    }
977
    if(width&1)
978
        b[2*x    ]= temp[x   ];
979

    
980
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
981
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
982
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
983
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
984
}
985

    
986
static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
987
    int x, y;
988

    
989
    for(y=0; y<height; y++){
990
        for(x=0; x<width; x++){
991
            buffer[y*stride + x] *= SCALEX;
992
        }
993
    }
994

    
995
    for(y=0; y<height; y++){
996
        horizontal_decomposeX(buffer + y*stride, width);
997
    }
998

    
999
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
1000
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
1001
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
1002
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
1003
}
1004

    
1005
static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
1006
    int x, y;
1007

    
1008
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1009
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1010
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1011
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1012

    
1013
    for(y=0; y<height; y++){
1014
        horizontal_composeX(buffer + y*stride, width);
1015
    }
1016

    
1017
    for(y=0; y<height; y++){
1018
        for(x=0; x<width; x++){
1019
            buffer[y*stride + x] /= SCALEX;
1020
        }
1021
    }
1022
}
1023

    
1024
static void horizontal_decompose53i(DWTELEM *b, int width){
1025
    DWTELEM temp[width];
1026
    const int width2= width>>1;
1027
    int x;
1028
    const int w2= (width+1)>>1;
1029

    
1030
    for(x=0; x<width2; x++){
1031
        temp[x   ]= b[2*x    ];
1032
        temp[x+w2]= b[2*x + 1];
1033
    }
1034
    if(width&1)
1035
        temp[x   ]= b[2*x    ];
1036
#if 0
1037
    {
1038
    int A1,A2,A3,A4;
1039
    A2= temp[1       ];
1040
    A4= temp[0       ];
1041
    A1= temp[0+width2];
1042
    A1 -= (A2 + A4)>>1;
1043
    A4 += (A1 + 1)>>1;
1044
    b[0+width2] = A1;
1045
    b[0       ] = A4;
1046
    for(x=1; x+1<width2; x+=2){
1047
        A3= temp[x+width2];
1048
        A4= temp[x+1     ];
1049
        A3 -= (A2 + A4)>>1;
1050
        A2 += (A1 + A3 + 2)>>2;
1051
        b[x+width2] = A3;
1052
        b[x       ] = A2;
1053

1054
        A1= temp[x+1+width2];
1055
        A2= temp[x+2       ];
1056
        A1 -= (A2 + A4)>>1;
1057
        A4 += (A1 + A3 + 2)>>2;
1058
        b[x+1+width2] = A1;
1059
        b[x+1       ] = A4;
1060
    }
1061
    A3= temp[width-1];
1062
    A3 -= A2;
1063
    A2 += (A1 + A3 + 2)>>2;
1064
    b[width -1] = A3;
1065
    b[width2-1] = A2;
1066
    }
1067
#else
1068
    lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1069
    lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1070
#endif
1071
}
1072

    
1073
static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1074
    int i;
1075

    
1076
    for(i=0; i<width; i++){
1077
        b1[i] -= (b0[i] + b2[i])>>1;
1078
    }
1079
}
1080

    
1081
static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1082
    int i;
1083

    
1084
    for(i=0; i<width; i++){
1085
        b1[i] += (b0[i] + b2[i] + 2)>>2;
1086
    }
1087
}
1088

    
1089
static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1090
    int y;
1091
    DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1092
    DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1093

    
1094
    for(y=-2; y<height; y+=2){
1095
        DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1096
        DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1097

    
1098
{START_TIMER
1099
        if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1100
        if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1101
STOP_TIMER("horizontal_decompose53i")}
1102

    
1103
{START_TIMER
1104
        if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1105
        if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1106
STOP_TIMER("vertical_decompose53i*")}
1107

    
1108
        b0=b2;
1109
        b1=b3;
1110
    }
1111
}
1112

    
1113
static void horizontal_decompose97i(DWTELEM *b, int width){
1114
    DWTELEM temp[width];
1115
    const int w2= (width+1)>>1;
1116

    
1117
    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1118
    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1119
    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1120
    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1121
}
1122

    
1123

    
1124
static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1125
    int i;
1126

    
1127
    for(i=0; i<width; i++){
1128
        b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1129
    }
1130
}
1131

    
1132
static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1133
    int i;
1134

    
1135
    for(i=0; i<width; i++){
1136
#ifdef lift5
1137
        b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1138
#else
1139
        int r= 3*(b0[i] + b2[i]);
1140
        r+= r>>4;
1141
        r+= r>>8;
1142
        b1[i] += (r+W_CO)>>W_CS;
1143
#endif
1144
    }
1145
}
1146

    
1147
static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1148
    int i;
1149

    
1150
    for(i=0; i<width; i++){
1151
#ifdef liftS
1152
        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1153
#else
1154
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1155
#endif
1156
    }
1157
}
1158

    
1159
static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1160
    int i;
1161

    
1162
    for(i=0; i<width; i++){
1163
        b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1164
    }
1165
}
1166

    
1167
static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1168
    int y;
1169
    DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1170
    DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1171
    DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1172
    DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1173

    
1174
    for(y=-4; y<height; y+=2){
1175
        DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1176
        DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1177

    
1178
{START_TIMER
1179
        if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1180
        if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1181
if(width>400){
1182
STOP_TIMER("horizontal_decompose97i")
1183
}}
1184

    
1185
{START_TIMER
1186
        if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1187
        if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1188
        if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1189
        if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1190

    
1191
if(width>400){
1192
STOP_TIMER("vertical_decompose97i")
1193
}}
1194

    
1195
        b0=b2;
1196
        b1=b3;
1197
        b2=b4;
1198
        b3=b5;
1199
    }
1200
}
1201

    
1202
void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1203
    int level;
1204

    
1205
    for(level=0; level<decomposition_count; level++){
1206
        switch(type){
1207
        case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1208
        case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1209
        case DWT_X: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1210
        }
1211
    }
1212
}
1213

    
1214
static void horizontal_compose53i(DWTELEM *b, int width){
1215
    DWTELEM temp[width];
1216
    const int width2= width>>1;
1217
    const int w2= (width+1)>>1;
1218
    int x;
1219

    
1220
#if 0
1221
    int A1,A2,A3,A4;
1222
    A2= temp[1       ];
1223
    A4= temp[0       ];
1224
    A1= temp[0+width2];
1225
    A1 -= (A2 + A4)>>1;
1226
    A4 += (A1 + 1)>>1;
1227
    b[0+width2] = A1;
1228
    b[0       ] = A4;
1229
    for(x=1; x+1<width2; x+=2){
1230
        A3= temp[x+width2];
1231
        A4= temp[x+1     ];
1232
        A3 -= (A2 + A4)>>1;
1233
        A2 += (A1 + A3 + 2)>>2;
1234
        b[x+width2] = A3;
1235
        b[x       ] = A2;
1236

1237
        A1= temp[x+1+width2];
1238
        A2= temp[x+2       ];
1239
        A1 -= (A2 + A4)>>1;
1240
        A4 += (A1 + A3 + 2)>>2;
1241
        b[x+1+width2] = A1;
1242
        b[x+1       ] = A4;
1243
    }
1244
    A3= temp[width-1];
1245
    A3 -= A2;
1246
    A2 += (A1 + A3 + 2)>>2;
1247
    b[width -1] = A3;
1248
    b[width2-1] = A2;
1249
#else
1250
    lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1251
    lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1252
#endif
1253
    for(x=0; x<width2; x++){
1254
        b[2*x    ]= temp[x   ];
1255
        b[2*x + 1]= temp[x+w2];
1256
    }
1257
    if(width&1)
1258
        b[2*x    ]= temp[x   ];
1259
}
1260

    
1261
static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1262
    int i;
1263

    
1264
    for(i=0; i<width; i++){
1265
        b1[i] += (b0[i] + b2[i])>>1;
1266
    }
1267
}
1268

    
1269
static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1270
    int i;
1271

    
1272
    for(i=0; i<width; i++){
1273
        b1[i] -= (b0[i] + b2[i] + 2)>>2;
1274
    }
1275
}
1276

    
1277
static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1278
    cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1279
    cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1280
    cs->y = -1;
1281
}
1282

    
1283
static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1284
    cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1285
    cs->b1 = buffer + mirror(-1  , height-1)*stride;
1286
    cs->y = -1;
1287
}
1288

    
1289
static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1290
    int y= cs->y;
1291

    
1292
    DWTELEM *b0= cs->b0;
1293
    DWTELEM *b1= cs->b1;
1294
    DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1295
    DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1296

    
1297
{START_TIMER
1298
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1299
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1300
STOP_TIMER("vertical_compose53i*")}
1301

    
1302
{START_TIMER
1303
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1304
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1305
STOP_TIMER("horizontal_compose53i")}
1306

    
1307
    cs->b0 = b2;
1308
    cs->b1 = b3;
1309
    cs->y += 2;
1310
}
1311

    
1312
static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1313
    int y= cs->y;
1314
    DWTELEM *b0= cs->b0;
1315
    DWTELEM *b1= cs->b1;
1316
    DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1317
    DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1318

    
1319
{START_TIMER
1320
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1321
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1322
STOP_TIMER("vertical_compose53i*")}
1323

    
1324
{START_TIMER
1325
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1326
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1327
STOP_TIMER("horizontal_compose53i")}
1328

    
1329
    cs->b0 = b2;
1330
    cs->b1 = b3;
1331
    cs->y += 2;
1332
}
1333

    
1334
static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1335
    dwt_compose_t cs;
1336
    spatial_compose53i_init(&cs, buffer, height, stride);
1337
    while(cs.y <= height)
1338
        spatial_compose53i_dy(&cs, buffer, width, height, stride);
1339
}
1340

    
1341

    
1342
void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1343
    DWTELEM temp[width];
1344
    const int w2= (width+1)>>1;
1345

    
1346
    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1347
    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1348
    liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1349
    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1350
}
1351

    
1352
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1353
    int i;
1354

    
1355
    for(i=0; i<width; i++){
1356
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1357
    }
1358
}
1359

    
1360
static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1361
    int i;
1362

    
1363
    for(i=0; i<width; i++){
1364
#ifdef lift5
1365
        b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1366
#else
1367
        int r= 3*(b0[i] + b2[i]);
1368
        r+= r>>4;
1369
        r+= r>>8;
1370
        b1[i] -= (r+W_CO)>>W_CS;
1371
#endif
1372
    }
1373
}
1374

    
1375
static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1376
    int i;
1377

    
1378
    for(i=0; i<width; i++){
1379
#ifdef liftS
1380
        b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1381
#else
1382
        b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1383
#endif
1384
    }
1385
}
1386

    
1387
static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1388
    int i;
1389

    
1390
    for(i=0; i<width; i++){
1391
        b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1392
    }
1393
}
1394

    
1395
void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1396
    int i;
1397

    
1398
    for(i=0; i<width; i++){
1399
#ifndef lift5
1400
        int r;
1401
#endif
1402
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1403
#ifdef lift5
1404
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1405
#else
1406
        r= 3*(b2[i] + b4[i]);
1407
        r+= r>>4;
1408
        r+= r>>8;
1409
        b3[i] -= (r+W_CO)>>W_CS;
1410
#endif
1411
#ifdef liftS
1412
        b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1413
#else
1414
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1415
#endif
1416
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1417
    }
1418
}
1419

    
1420
static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1421
    cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1422
    cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1423
    cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1424
    cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1425
    cs->y = -3;
1426
}
1427

    
1428
static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1429
    cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1430
    cs->b1 = buffer + mirror(-3  , height-1)*stride;
1431
    cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1432
    cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1433
    cs->y = -3;
1434
}
1435

    
1436
static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1437
    int y = cs->y;
1438

    
1439
    DWTELEM *b0= cs->b0;
1440
    DWTELEM *b1= cs->b1;
1441
    DWTELEM *b2= cs->b2;
1442
    DWTELEM *b3= cs->b3;
1443
    DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1444
    DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1445

    
1446
{START_TIMER
1447
    if(y>0 && y+4<height){
1448
        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1449
    }else{
1450
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1451
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1452
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1453
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1454
    }
1455
if(width>400){
1456
STOP_TIMER("vertical_compose97i")}}
1457

    
1458
{START_TIMER
1459
        if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1460
        if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1461
if(width>400 && y+0<(unsigned)height){
1462
STOP_TIMER("horizontal_compose97i")}}
1463

    
1464
    cs->b0=b2;
1465
    cs->b1=b3;
1466
    cs->b2=b4;
1467
    cs->b3=b5;
1468
    cs->y += 2;
1469
}
1470

    
1471
static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1472
    int y = cs->y;
1473
    DWTELEM *b0= cs->b0;
1474
    DWTELEM *b1= cs->b1;
1475
    DWTELEM *b2= cs->b2;
1476
    DWTELEM *b3= cs->b3;
1477
    DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1478
    DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1479

    
1480
{START_TIMER
1481
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1482
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1483
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1484
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1485
if(width>400){
1486
STOP_TIMER("vertical_compose97i")}}
1487

    
1488
{START_TIMER
1489
        if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1490
        if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1491
if(width>400 && b0 <= b2){
1492
STOP_TIMER("horizontal_compose97i")}}
1493

    
1494
    cs->b0=b2;
1495
    cs->b1=b3;
1496
    cs->b2=b4;
1497
    cs->b3=b5;
1498
    cs->y += 2;
1499
}
1500

    
1501
static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1502
    dwt_compose_t cs;
1503
    spatial_compose97i_init(&cs, buffer, height, stride);
1504
    while(cs.y <= height)
1505
        spatial_compose97i_dy(&cs, buffer, width, height, stride);
1506
}
1507

    
1508
static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1509
    int level;
1510
    for(level=decomposition_count-1; level>=0; level--){
1511
        switch(type){
1512
        case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1513
        case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1514
        /* not slicified yet */
1515
        case DWT_X: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1516
          av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1517
        }
1518
    }
1519
}
1520

    
1521
static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1522
    int level;
1523
    for(level=decomposition_count-1; level>=0; level--){
1524
        switch(type){
1525
        case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1526
        case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1527
        /* not slicified yet */
1528
        case DWT_X: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1529
        }
1530
    }
1531
}
1532

    
1533
static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1534
    const int support = type==1 ? 3 : 5;
1535
    int level;
1536
    if(type==2) return;
1537

    
1538
    for(level=decomposition_count-1; level>=0; level--){
1539
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1540
            switch(type){
1541
            case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1542
                    break;
1543
            case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1544
                    break;
1545
            case DWT_X: break;
1546
            }
1547
        }
1548
    }
1549
}
1550

    
1551
static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1552
    const int support = type==1 ? 3 : 5;
1553
    int level;
1554
    if(type==2) return;
1555

    
1556
    for(level=decomposition_count-1; level>=0; level--){
1557
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1558
            switch(type){
1559
            case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1560
                    break;
1561
            case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1562
                    break;
1563
            case DWT_X: break;
1564
            }
1565
        }
1566
    }
1567
}
1568

    
1569
static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1570
    if(type==2){
1571
        int level;
1572
        for(level=decomposition_count-1; level>=0; level--)
1573
            spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1574
    }else{
1575
        dwt_compose_t cs[MAX_DECOMPOSITIONS];
1576
        int y;
1577
        ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1578
        for(y=0; y<height; y+=4)
1579
            ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1580
    }
1581
}
1582

    
1583
static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1584
    const int w= b->width;
1585
    const int h= b->height;
1586
    int x, y;
1587

    
1588
    if(1){
1589
        int run=0;
1590
        int runs[w*h];
1591
        int run_index=0;
1592
        int max_index;
1593

    
1594
        for(y=0; y<h; y++){
1595
            for(x=0; x<w; x++){
1596
                int v, p=0;
1597
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1598
                v= src[x + y*stride];
1599

    
1600
                if(y){
1601
                    t= src[x + (y-1)*stride];
1602
                    if(x){
1603
                        lt= src[x - 1 + (y-1)*stride];
1604
                    }
1605
                    if(x + 1 < w){
1606
                        rt= src[x + 1 + (y-1)*stride];
1607
                    }
1608
                }
1609
                if(x){
1610
                    l= src[x - 1 + y*stride];
1611
                    /*if(x > 1){
1612
                        if(orientation==1) ll= src[y + (x-2)*stride];
1613
                        else               ll= src[x - 2 + y*stride];
1614
                    }*/
1615
                }
1616
                if(parent){
1617
                    int px= x>>1;
1618
                    int py= y>>1;
1619
                    if(px<b->parent->width && py<b->parent->height)
1620
                        p= parent[px + py*2*stride];
1621
                }
1622
                if(!(/*ll|*/l|lt|t|rt|p)){
1623
                    if(v){
1624
                        runs[run_index++]= run;
1625
                        run=0;
1626
                    }else{
1627
                        run++;
1628
                    }
1629
                }
1630
            }
1631
        }
1632
        max_index= run_index;
1633
        runs[run_index++]= run;
1634
        run_index=0;
1635
        run= runs[run_index++];
1636

    
1637
        put_symbol2(&s->c, b->state[30], max_index, 0);
1638
        if(run_index <= max_index)
1639
            put_symbol2(&s->c, b->state[1], run, 3);
1640

    
1641
        for(y=0; y<h; y++){
1642
            if(s->c.bytestream_end - s->c.bytestream < w*40){
1643
                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1644
                return -1;
1645
            }
1646
            for(x=0; x<w; x++){
1647
                int v, p=0;
1648
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1649
                v= src[x + y*stride];
1650

    
1651
                if(y){
1652
                    t= src[x + (y-1)*stride];
1653
                    if(x){
1654
                        lt= src[x - 1 + (y-1)*stride];
1655
                    }
1656
                    if(x + 1 < w){
1657
                        rt= src[x + 1 + (y-1)*stride];
1658
                    }
1659
                }
1660
                if(x){
1661
                    l= src[x - 1 + y*stride];
1662
                    /*if(x > 1){
1663
                        if(orientation==1) ll= src[y + (x-2)*stride];
1664
                        else               ll= src[x - 2 + y*stride];
1665
                    }*/
1666
                }
1667
                if(parent){
1668
                    int px= x>>1;
1669
                    int py= y>>1;
1670
                    if(px<b->parent->width && py<b->parent->height)
1671
                        p= parent[px + py*2*stride];
1672
                }
1673
                if(/*ll|*/l|lt|t|rt|p){
1674
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1675

    
1676
                    put_rac(&s->c, &b->state[0][context], !!v);
1677
                }else{
1678
                    if(!run){
1679
                        run= runs[run_index++];
1680

    
1681
                        if(run_index <= max_index)
1682
                            put_symbol2(&s->c, b->state[1], run, 3);
1683
                        assert(v);
1684
                    }else{
1685
                        run--;
1686
                        assert(!v);
1687
                    }
1688
                }
1689
                if(v){
1690
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1691
                    int l2= 2*FFABS(l) + (l<0);
1692
                    int t2= 2*FFABS(t) + (t<0);
1693

    
1694
                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1695
                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1696
                }
1697
            }
1698
        }
1699
    }
1700
    return 0;
1701
}
1702

    
1703
static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1704
//    encode_subband_qtree(s, b, src, parent, stride, orientation);
1705
//    encode_subband_z0run(s, b, src, parent, stride, orientation);
1706
    return encode_subband_c0run(s, b, src, parent, stride, orientation);
1707
//    encode_subband_dzr(s, b, src, parent, stride, orientation);
1708
}
1709

    
1710
static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1711
    const int w= b->width;
1712
    const int h= b->height;
1713
    int x,y;
1714

    
1715
    if(1){
1716
        int run, runs;
1717
        x_and_coeff *xc= b->x_coeff;
1718
        x_and_coeff *prev_xc= NULL;
1719
        x_and_coeff *prev2_xc= xc;
1720
        x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1721
        x_and_coeff *prev_parent_xc= parent_xc;
1722

    
1723
        runs= get_symbol2(&s->c, b->state[30], 0);
1724
        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1725
        else           run= INT_MAX;
1726

    
1727
        for(y=0; y<h; y++){
1728
            int v=0;
1729
            int lt=0, t=0, rt=0;
1730

    
1731
            if(y && prev_xc->x == 0){
1732
                rt= prev_xc->coeff;
1733
            }
1734
            for(x=0; x<w; x++){
1735
                int p=0;
1736
                const int l= v;
1737

    
1738
                lt= t; t= rt;
1739

    
1740
                if(y){
1741
                    if(prev_xc->x <= x)
1742
                        prev_xc++;
1743
                    if(prev_xc->x == x + 1)
1744
                        rt= prev_xc->coeff;
1745
                    else
1746
                        rt=0;
1747
                }
1748
                if(parent_xc){
1749
                    if(x>>1 > parent_xc->x){
1750
                        parent_xc++;
1751
                    }
1752
                    if(x>>1 == parent_xc->x){
1753
                        p= parent_xc->coeff;
1754
                    }
1755
                }
1756
                if(/*ll|*/l|lt|t|rt|p){
1757
                    int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1758

    
1759
                    v=get_rac(&s->c, &b->state[0][context]);
1760
                    if(v){
1761
                        v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1762
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1763

    
1764
                        xc->x=x;
1765
                        (xc++)->coeff= v;
1766
                    }
1767
                }else{
1768
                    if(!run){
1769
                        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1770
                        else           run= INT_MAX;
1771
                        v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1772
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1773

    
1774
                        xc->x=x;
1775
                        (xc++)->coeff= v;
1776
                    }else{
1777
                        int max_run;
1778
                        run--;
1779
                        v=0;
1780

    
1781
                        if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1782
                        else  max_run= FFMIN(run, w-x-1);
1783
                        if(parent_xc)
1784
                            max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1785
                        x+= max_run;
1786
                        run-= max_run;
1787
                    }
1788
                }
1789
            }
1790
            (xc++)->x= w+1; //end marker
1791
            prev_xc= prev2_xc;
1792
            prev2_xc= xc;
1793

    
1794
            if(parent_xc){
1795
                if(y&1){
1796
                    while(parent_xc->x != parent->width+1)
1797
                        parent_xc++;
1798
                    parent_xc++;
1799
                    prev_parent_xc= parent_xc;
1800
                }else{
1801
                    parent_xc= prev_parent_xc;
1802
                }
1803
            }
1804
        }
1805

    
1806
        (xc++)->x= w+1; //end marker
1807
    }
1808
}
1809

    
1810
static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1811
    const int w= b->width;
1812
    int y;
1813
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1814
    int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1815
    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1816
    int new_index = 0;
1817

    
1818
    START_TIMER
1819

    
1820
    if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1821
        qadd= 0;
1822
        qmul= 1<<QEXPSHIFT;
1823
    }
1824

    
1825
    /* If we are on the second or later slice, restore our index. */
1826
    if (start_y != 0)
1827
        new_index = save_state[0];
1828

    
1829

    
1830
    for(y=start_y; y<h; y++){
1831
        int x = 0;
1832
        int v;
1833
        DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1834
        memset(line, 0, b->width*sizeof(DWTELEM));
1835
        v = b->x_coeff[new_index].coeff;
1836
        x = b->x_coeff[new_index++].x;
1837
        while(x < w)
1838
        {
1839
            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1840
            register int u= -(v&1);
1841
            line[x] = (t^u) - u;
1842

    
1843
            v = b->x_coeff[new_index].coeff;
1844
            x = b->x_coeff[new_index++].x;
1845
        }
1846
    }
1847
    if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1848
        STOP_TIMER("decode_subband")
1849
    }
1850

    
1851
    /* Save our variables for the next slice. */
1852
    save_state[0] = new_index;
1853

    
1854
    return;
1855
}
1856

    
1857
static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1858
    int plane_index, level, orientation;
1859

    
1860
    for(plane_index=0; plane_index<3; plane_index++){
1861
        for(level=0; level<s->spatial_decomposition_count; level++){
1862
            for(orientation=level ? 1:0; orientation<4; orientation++){
1863
                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1864
            }
1865
        }
1866
    }
1867
    memset(s->header_state, MID_STATE, sizeof(s->header_state));
1868
    memset(s->block_state, MID_STATE, sizeof(s->block_state));
1869
}
1870

    
1871
static int alloc_blocks(SnowContext *s){
1872
    int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1873
    int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1874

    
1875
    s->b_width = w;
1876
    s->b_height= h;
1877

    
1878
    s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1879
    return 0;
1880
}
1881

    
1882
static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1883
    uint8_t *bytestream= d->bytestream;
1884
    uint8_t *bytestream_start= d->bytestream_start;
1885
    *d= *s;
1886
    d->bytestream= bytestream;
1887
    d->bytestream_start= bytestream_start;
1888
}
1889

    
1890
//near copy & paste from dsputil, FIXME
1891
static int pix_sum(uint8_t * pix, int line_size, int w)
1892
{
1893
    int s, i, j;
1894

    
1895
    s = 0;
1896
    for (i = 0; i < w; i++) {
1897
        for (j = 0; j < w; j++) {
1898
            s += pix[0];
1899
            pix ++;
1900
        }
1901
        pix += line_size - w;
1902
    }
1903
    return s;
1904
}
1905

    
1906
//near copy & paste from dsputil, FIXME
1907
static int pix_norm1(uint8_t * pix, int line_size, int w)
1908
{
1909
    int s, i, j;
1910
    uint32_t *sq = ff_squareTbl + 256;
1911

    
1912
    s = 0;
1913
    for (i = 0; i < w; i++) {
1914
        for (j = 0; j < w; j ++) {
1915
            s += sq[pix[0]];
1916
            pix ++;
1917
        }
1918
        pix += line_size - w;
1919
    }
1920
    return s;
1921
}
1922

    
1923
static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1924
    const int w= s->b_width << s->block_max_depth;
1925
    const int rem_depth= s->block_max_depth - level;
1926
    const int index= (x + y*w) << rem_depth;
1927
    const int block_w= 1<<rem_depth;
1928
    BlockNode block;
1929
    int i,j;
1930

    
1931
    block.color[0]= l;
1932
    block.color[1]= cb;
1933
    block.color[2]= cr;
1934
    block.mx= mx;
1935
    block.my= my;
1936
    block.ref= ref;
1937
    block.type= type;
1938
    block.level= level;
1939

    
1940
    for(j=0; j<block_w; j++){
1941
        for(i=0; i<block_w; i++){
1942
            s->block[index + i + j*w]= block;
1943
        }
1944
    }
1945
}
1946

    
1947
static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1948
    const int offset[3]= {
1949
          y*c->  stride + x,
1950
        ((y*c->uvstride + x)>>1),
1951
        ((y*c->uvstride + x)>>1),
1952
    };
1953
    int i;
1954
    for(i=0; i<3; i++){
1955
        c->src[0][i]= src [i];
1956
        c->ref[0][i]= ref [i] + offset[i];
1957
    }
1958
    assert(!ref_index);
1959
}
1960

    
1961
static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1962
                           const BlockNode *left, const BlockNode *top, const BlockNode *tr){
1963
    if(s->ref_frames == 1){
1964
        *mx = mid_pred(left->mx, top->mx, tr->mx);
1965
        *my = mid_pred(left->my, top->my, tr->my);
1966
    }else{
1967
        const int *scale = scale_mv_ref[ref];
1968
        *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
1969
                       (top ->mx * scale[top ->ref] + 128) >>8,
1970
                       (tr  ->mx * scale[tr  ->ref] + 128) >>8);
1971
        *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
1972
                       (top ->my * scale[top ->ref] + 128) >>8,
1973
                       (tr  ->my * scale[tr  ->ref] + 128) >>8);
1974
    }
1975
}
1976

    
1977
//FIXME copy&paste
1978
#define P_LEFT P[1]
1979
#define P_TOP P[2]
1980
#define P_TOPRIGHT P[3]
1981
#define P_MEDIAN P[4]
1982
#define P_MV1 P[9]
1983
#define FLAG_QPEL   1 //must be 1
1984

    
1985
static int encode_q_branch(SnowContext *s, int level, int x, int y){
1986
    uint8_t p_buffer[1024];
1987
    uint8_t i_buffer[1024];
1988
    uint8_t p_state[sizeof(s->block_state)];
1989
    uint8_t i_state[sizeof(s->block_state)];
1990
    RangeCoder pc, ic;
1991
    uint8_t *pbbak= s->c.bytestream;
1992
    uint8_t *pbbak_start= s->c.bytestream_start;
1993
    int score, score2, iscore, i_len, p_len, block_s, sum;
1994
    const int w= s->b_width  << s->block_max_depth;
1995
    const int h= s->b_height << s->block_max_depth;
1996
    const int rem_depth= s->block_max_depth - level;
1997
    const int index= (x + y*w) << rem_depth;
1998
    const int block_w= 1<<(LOG2_MB_SIZE - level);
1999
    int trx= (x+1)<<rem_depth;
2000
    int try= (y+1)<<rem_depth;
2001
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2002
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2003
    const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
2004
    const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2005
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2006
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2007
    int pl = left->color[0];
2008
    int pcb= left->color[1];
2009
    int pcr= left->color[2];
2010
    int pmx, pmy;
2011
    int mx=0, my=0;
2012
    int l,cr,cb;
2013
    const int stride= s->current_picture.linesize[0];
2014
    const int uvstride= s->current_picture.linesize[1];
2015
    uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2016
                                s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2017
                                s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2018
    int P[10][2];
2019
    int16_t last_mv[3][2];
2020
    int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2021
    const int shift= 1+qpel;
2022
    MotionEstContext *c= &s->m.me;
2023
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2024
    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2025
    int my_context= av_log2(2*FFABS(left->my - top->my));
2026
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2027
    int ref, best_ref, ref_score, ref_mx, ref_my;
2028

    
2029
    assert(sizeof(s->block_state) >= 256);
2030
    if(s->keyframe){
2031
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2032
        return 0;
2033
    }
2034

    
2035
//    clip predictors / edge ?
2036

    
2037
    P_LEFT[0]= left->mx;
2038
    P_LEFT[1]= left->my;
2039
    P_TOP [0]= top->mx;
2040
    P_TOP [1]= top->my;
2041
    P_TOPRIGHT[0]= tr->mx;
2042
    P_TOPRIGHT[1]= tr->my;
2043

    
2044
    last_mv[0][0]= s->block[index].mx;
2045
    last_mv[0][1]= s->block[index].my;
2046
    last_mv[1][0]= right->mx;
2047
    last_mv[1][1]= right->my;
2048
    last_mv[2][0]= bottom->mx;
2049
    last_mv[2][1]= bottom->my;
2050

    
2051
    s->m.mb_stride=2;
2052
    s->m.mb_x=
2053
    s->m.mb_y= 0;
2054
    c->skip= 0;
2055

    
2056
    assert(c->  stride ==   stride);
2057
    assert(c->uvstride == uvstride);
2058

    
2059
    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2060
    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2061
    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2062
    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2063

    
2064
    c->xmin = - x*block_w - 16+2;
2065
    c->ymin = - y*block_w - 16+2;
2066
    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2067
    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2068

    
2069
    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2070
    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2071
    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2072
    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2073
    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2074
    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2075
    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2076

    
2077
    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2078
    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2079

    
2080
    if (!y) {
2081
        c->pred_x= P_LEFT[0];
2082
        c->pred_y= P_LEFT[1];
2083
    } else {
2084
        c->pred_x = P_MEDIAN[0];
2085
        c->pred_y = P_MEDIAN[1];
2086
    }
2087

    
2088
    score= INT_MAX;
2089
    best_ref= 0;
2090
    for(ref=0; ref<s->ref_frames; ref++){
2091
        init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2092

    
2093
        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2094
                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2095

    
2096
        assert(ref_mx >= c->xmin);
2097
        assert(ref_mx <= c->xmax);
2098
        assert(ref_my >= c->ymin);
2099
        assert(ref_my <= c->ymax);
2100

    
2101
        ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2102
        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2103
        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2104
        if(s->ref_mvs[ref]){
2105
            s->ref_mvs[ref][index][0]= ref_mx;
2106
            s->ref_mvs[ref][index][1]= ref_my;
2107
            s->ref_scores[ref][index]= ref_score;
2108
        }
2109
        if(score > ref_score){
2110
            score= ref_score;
2111
            best_ref= ref;
2112
            mx= ref_mx;
2113
            my= ref_my;
2114
        }
2115
    }
2116
    //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
2117

    
2118
  //  subpel search
2119
    pc= s->c;
2120
    pc.bytestream_start=
2121
    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2122
    memcpy(p_state, s->block_state, sizeof(s->block_state));
2123

    
2124
    if(level!=s->block_max_depth)
2125
        put_rac(&pc, &p_state[4 + s_context], 1);
2126
    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2127
    if(s->ref_frames > 1)
2128
        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2129
    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
2130
    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2131
    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2132
    p_len= pc.bytestream - pc.bytestream_start;
2133
    score += (s->lambda2*(p_len*8
2134
              + (pc.outstanding_count - s->c.outstanding_count)*8
2135
              + (-av_log2(pc.range)    + av_log2(s->c.range))
2136
             ))>>FF_LAMBDA_SHIFT;
2137

    
2138
    block_s= block_w*block_w;
2139
    sum = pix_sum(current_data[0], stride, block_w);
2140
    l= (sum + block_s/2)/block_s;
2141
    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2142

    
2143
    block_s= block_w*block_w>>2;
2144
    sum = pix_sum(current_data[1], uvstride, block_w>>1);
2145
    cb= (sum + block_s/2)/block_s;
2146
//    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2147
    sum = pix_sum(current_data[2], uvstride, block_w>>1);
2148
    cr= (sum + block_s/2)/block_s;
2149
//    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2150

    
2151
    ic= s->c;
2152
    ic.bytestream_start=
2153
    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2154
    memcpy(i_state, s->block_state, sizeof(s->block_state));
2155
    if(level!=s->block_max_depth)
2156
        put_rac(&ic, &i_state[4 + s_context], 1);
2157
    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2158
    put_symbol(&ic, &i_state[32],  l-pl , 1);
2159
    put_symbol(&ic, &i_state[64], cb-pcb, 1);
2160
    put_symbol(&ic, &i_state[96], cr-pcr, 1);
2161
    i_len= ic.bytestream - ic.bytestream_start;
2162
    iscore += (s->lambda2*(i_len*8
2163
              + (ic.outstanding_count - s->c.outstanding_count)*8
2164
              + (-av_log2(ic.range)    + av_log2(s->c.range))
2165
             ))>>FF_LAMBDA_SHIFT;
2166

    
2167
//    assert(score==256*256*256*64-1);
2168
    assert(iscore < 255*255*256 + s->lambda2*10);
2169
    assert(iscore >= 0);
2170
    assert(l>=0 && l<=255);
2171
    assert(pl>=0 && pl<=255);
2172

    
2173
    if(level==0){
2174
        int varc= iscore >> 8;
2175
        int vard= score >> 8;
2176
        if (vard <= 64 || vard < varc)
2177
            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2178
        else
2179
            c->scene_change_score+= s->m.qscale;
2180
    }
2181

    
2182
    if(level!=s->block_max_depth){
2183
        put_rac(&s->c, &s->block_state[4 + s_context], 0);
2184
        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2185
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2186
        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2187
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2188
        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2189

    
2190
        if(score2 < score && score2 < iscore)
2191
            return score2;
2192
    }
2193

    
2194
    if(iscore < score){
2195
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2196
        memcpy(pbbak, i_buffer, i_len);
2197
        s->c= ic;
2198
        s->c.bytestream_start= pbbak_start;
2199
        s->c.bytestream= pbbak + i_len;
2200
        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2201
        memcpy(s->block_state, i_state, sizeof(s->block_state));
2202
        return iscore;
2203
    }else{
2204
        memcpy(pbbak, p_buffer, p_len);
2205
        s->c= pc;
2206
        s->c.bytestream_start= pbbak_start;
2207
        s->c.bytestream= pbbak + p_len;
2208
        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2209
        memcpy(s->block_state, p_state, sizeof(s->block_state));
2210
        return score;
2211
    }
2212
}
2213

    
2214
static av_always_inline int same_block(BlockNode *a, BlockNode *b){
2215
    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2216
        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2217
    }else{
2218
        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2219
    }
2220
}
2221

    
2222
static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2223
    const int w= s->b_width  << s->block_max_depth;
2224
    const int rem_depth= s->block_max_depth - level;
2225
    const int index= (x + y*w) << rem_depth;
2226
    int trx= (x+1)<<rem_depth;
2227
    BlockNode *b= &s->block[index];
2228
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2229
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2230
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2231
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2232
    int pl = left->color[0];
2233
    int pcb= left->color[1];
2234
    int pcr= left->color[2];
2235
    int pmx, pmy;
2236
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2237
    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2238
    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2239
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2240

    
2241
    if(s->keyframe){
2242
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2243
        return;
2244
    }
2245

    
2246
    if(level!=s->block_max_depth){
2247
        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2248
            put_rac(&s->c, &s->block_state[4 + s_context], 1);
2249
        }else{
2250
            put_rac(&s->c, &s->block_state[4 + s_context], 0);
2251
            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2252
            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2253
            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2254
            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2255
            return;
2256
        }
2257
    }
2258
    if(b->type & BLOCK_INTRA){
2259
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2260
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2261
        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2262
        put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2263
        put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2264
        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2265
    }else{
2266
        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2267
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2268
        if(s->ref_frames > 1)
2269
            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2270
        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2271
        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2272
        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2273
    }
2274
}
2275

    
2276
static void decode_q_branch(SnowContext *s, int level, int x, int y){
2277
    const int w= s->b_width << s->block_max_depth;
2278
    const int rem_depth= s->block_max_depth - level;
2279
    const int index= (x + y*w) << rem_depth;
2280
    int trx= (x+1)<<rem_depth;
2281
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2282
    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2283
    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2284
    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2285
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2286

    
2287
    if(s->keyframe){
2288
        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2289
        return;
2290
    }
2291

    
2292
    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2293
        int type, mx, my;
2294
        int l = left->color[0];
2295
        int cb= left->color[1];
2296
        int cr= left->color[2];
2297
        int ref = 0;
2298
        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2299
        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2300
        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2301

    
2302
        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2303

    
2304
        if(type){
2305
            pred_mv(s, &mx, &my, 0, left, top, tr);
2306
            l += get_symbol(&s->c, &s->block_state[32], 1);
2307
            cb+= get_symbol(&s->c, &s->block_state[64], 1);
2308
            cr+= get_symbol(&s->c, &s->block_state[96], 1);
2309
        }else{
2310
            if(s->ref_frames > 1)
2311
                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2312
            pred_mv(s, &mx, &my, ref, left, top, tr);
2313
            mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2314
            my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2315
        }
2316
        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2317
    }else{
2318
        decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2319
        decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2320
        decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2321
        decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2322
    }
2323
}
2324

    
2325
static void encode_blocks(SnowContext *s, int search){
2326
    int x, y;
2327
    int w= s->b_width;
2328
    int h= s->b_height;
2329

    
2330
    if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2331
        iterative_me(s);
2332

    
2333
    for(y=0; y<h; y++){
2334
        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2335
            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2336
            return;
2337
        }
2338
        for(x=0; x<w; x++){
2339
            if(s->avctx->me_method == ME_ITER || !search)
2340
                encode_q_branch2(s, 0, x, y);
2341
            else
2342
                encode_q_branch (s, 0, x, y);
2343
        }
2344
    }
2345
}
2346

    
2347
static void decode_blocks(SnowContext *s){
2348
    int x, y;
2349
    int w= s->b_width;
2350
    int h= s->b_height;
2351

    
2352
    for(y=0; y<h; y++){
2353
        for(x=0; x<w; x++){
2354
            decode_q_branch(s, 0, x, y);
2355
        }
2356
    }
2357
}
2358

    
2359
static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2360
    int x, y;
2361
START_TIMER
2362
    for(y=0; y < b_h+5; y++){
2363
        for(x=0; x < b_w; x++){
2364
            int a0= src[x    ];
2365
            int a1= src[x + 1];
2366
            int a2= src[x + 2];
2367
            int a3= src[x + 3];
2368
            int a4= src[x + 4];
2369
            int a5= src[x + 5];
2370
//            int am= 9*(a1+a2) - (a0+a3);
2371
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2372
//            int am= 18*(a2+a3) - 2*(a1+a4);
2373
//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2374
//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2375

    
2376
//            if(b_w==16) am= 8*(a1+a2);
2377

    
2378
            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2379
            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2380

    
2381
            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2382
            if(am&(~255)) am= ~(am>>31);
2383

    
2384
            tmp[x] = am;
2385

    
2386
/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2387
            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2388
            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2389
            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2390
        }
2391
        tmp += stride;
2392
        src += stride;
2393
    }
2394
    tmp -= (b_h+5)*stride;
2395

    
2396
    for(y=0; y < b_h; y++){
2397
        for(x=0; x < b_w; x++){
2398
            int a0= tmp[x + 0*stride];
2399
            int a1= tmp[x + 1*stride];
2400
            int a2= tmp[x + 2*stride];
2401
            int a3= tmp[x + 3*stride];
2402
            int a4= tmp[x + 4*stride];
2403
            int a5= tmp[x + 5*stride];
2404
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2405
//            int am= 18*(a2+a3) - 2*(a1+a4);
2406
/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2407
            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2408

    
2409
//            if(b_w==16) am= 8*(a1+a2);
2410

    
2411
            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2412
            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2413

    
2414
            if(am&(~255)) am= ~(am>>31);
2415

    
2416
            dst[x] = am;
2417
/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2418
            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2419
            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2420
            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2421
        }
2422
        dst += stride;
2423
        tmp += stride;
2424
    }
2425
STOP_TIMER("mc_block")
2426
}
2427

    
2428
#define mca(dx,dy,b_w)\
2429
static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
2430
    uint8_t tmp[stride*(b_w+5)];\
2431
    assert(h==b_w);\
2432
    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2433
}
2434

    
2435
mca( 0, 0,16)
2436
mca( 8, 0,16)
2437
mca( 0, 8,16)
2438
mca( 8, 8,16)
2439
mca( 0, 0,8)
2440
mca( 8, 0,8)
2441
mca( 0, 8,8)
2442
mca( 8, 8,8)
2443

    
2444
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2445
    if(block->type & BLOCK_INTRA){
2446
        int x, y;
2447
        const int color = block->color[plane_index];
2448
        const int color4= color*0x01010101;
2449
        if(b_w==32){
2450
            for(y=0; y < b_h; y++){
2451
                *(uint32_t*)&dst[0 + y*stride]= color4;
2452
                *(uint32_t*)&dst[4 + y*stride]= color4;
2453
                *(uint32_t*)&dst[8 + y*stride]= color4;
2454
                *(uint32_t*)&dst[12+ y*stride]= color4;
2455
                *(uint32_t*)&dst[16+ y*stride]= color4;
2456
                *(uint32_t*)&dst[20+ y*stride]= color4;
2457
                *(uint32_t*)&dst[24+ y*stride]= color4;
2458
                *(uint32_t*)&dst[28+ y*stride]= color4;
2459
            }
2460
        }else if(b_w==16){
2461
            for(y=0; y < b_h; y++){
2462
                *(uint32_t*)&dst[0 + y*stride]= color4;
2463
                *(uint32_t*)&dst[4 + y*stride]= color4;
2464
                *(uint32_t*)&dst[8 + y*stride]= color4;
2465
                *(uint32_t*)&dst[12+ y*stride]= color4;
2466
            }
2467
        }else if(b_w==8){
2468
            for(y=0; y < b_h; y++){
2469
                *(uint32_t*)&dst[0 + y*stride]= color4;
2470
                *(uint32_t*)&dst[4 + y*stride]= color4;
2471
            }
2472
        }else if(b_w==4){
2473
            for(y=0; y < b_h; y++){
2474
                *(uint32_t*)&dst[0 + y*stride]= color4;
2475
            }
2476
        }else{
2477
            for(y=0; y < b_h; y++){
2478
                for(x=0; x < b_w; x++){
2479
                    dst[x + y*stride]= color;
2480
                }
2481
            }
2482
        }
2483
    }else{
2484
        uint8_t *src= s->last_picture[block->ref].data[plane_index];
2485
        const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2486
        int mx= block->mx*scale;
2487
        int my= block->my*scale;
2488
        const int dx= mx&15;
2489
        const int dy= my&15;
2490
        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2491
        sx += (mx>>4) - 2;
2492
        sy += (my>>4) - 2;
2493
        src += sx + sy*stride;
2494
        if(   (unsigned)sx >= w - b_w - 4
2495
           || (unsigned)sy >= h - b_h - 4){
2496
            ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2497
            src= tmp + MB_SIZE;
2498
        }
2499
//        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2500
//        assert(!(b_w&(b_w-1)));
2501
        assert(b_w>1 && b_h>1);
2502
        assert(tab_index>=0 && tab_index<4 || b_w==32);
2503
        if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2504
            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2505
        else if(b_w==32){
2506
            int y;
2507
            for(y=0; y<b_h; y+=16){
2508
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2509
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2510
            }
2511
        }else if(b_w==b_h)
2512
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2513
        else if(b_w==2*b_h){
2514
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2515
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2516
        }else{
2517
            assert(2*b_w==b_h);
2518
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2519
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2520
        }
2521
    }
2522
}
2523

    
2524
void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2525
                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2526
    int y, x;
2527
    DWTELEM * dst;
2528
    for(y=0; y<b_h; y++){
2529
        //FIXME ugly missue of obmc_stride
2530
        const uint8_t *obmc1= obmc + y*obmc_stride;
2531
        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2532
        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2533
        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2534
        dst = slice_buffer_get_line(sb, src_y + y);
2535
        for(x=0; x<b_w; x++){
2536
            int v=   obmc1[x] * block[3][x + y*src_stride]
2537
                    +obmc2[x] * block[2][x + y*src_stride]
2538
                    +obmc3[x] * block[1][x + y*src_stride]
2539
                    +obmc4[x] * block[0][x + y*src_stride];
2540

    
2541
            v <<= 8 - LOG2_OBMC_MAX;
2542
            if(FRAC_BITS != 8){
2543
                v += 1<<(7 - FRAC_BITS);
2544
                v >>= 8 - FRAC_BITS;
2545
            }
2546
            if(add){
2547
                v += dst[x + src_x];
2548
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2549
                if(v&(~255)) v= ~(v>>31);
2550
                dst8[x + y*src_stride] = v;
2551
            }else{
2552
                dst[x + src_x] -= v;
2553
            }
2554
        }
2555
    }
2556
}
2557

    
2558
//FIXME name clenup (b_w, block_w, b_width stuff)
2559
static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2560
    const int b_width = s->b_width  << s->block_max_depth;
2561
    const int b_height= s->b_height << s->block_max_depth;
2562
    const int b_stride= b_width;
2563
    BlockNode *lt= &s->block[b_x + b_y*b_stride];
2564
    BlockNode *rt= lt+1;
2565
    BlockNode *lb= lt+b_stride;
2566
    BlockNode *rb= lb+1;
2567
    uint8_t *block[4];
2568
    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2569
    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2570
    uint8_t *ptmp;
2571
    int x,y;
2572

    
2573
    if(b_x<0){
2574
        lt= rt;
2575
        lb= rb;
2576
    }else if(b_x + 1 >= b_width){
2577
        rt= lt;
2578
        rb= lb;
2579
    }
2580
    if(b_y<0){
2581
        lt= lb;
2582
        rt= rb;
2583
    }else if(b_y + 1 >= b_height){
2584
        lb= lt;
2585
        rb= rt;
2586
    }
2587

    
2588
    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2589
        obmc -= src_x;
2590
        b_w += src_x;
2591
        if(!sliced && !offset_dst)
2592
            dst -= src_x;
2593
        src_x=0;
2594
    }else if(src_x + b_w > w){
2595
        b_w = w - src_x;
2596
    }
2597
    if(src_y<0){
2598
        obmc -= src_y*obmc_stride;
2599
        b_h += src_y;
2600
        if(!sliced && !offset_dst)
2601
            dst -= src_y*dst_stride;
2602
        src_y=0;
2603
    }else if(src_y + b_h> h){
2604
        b_h = h - src_y;
2605
    }
2606

    
2607
    if(b_w<=0 || b_h<=0) return;
2608

    
2609
assert(src_stride > 2*MB_SIZE + 5);
2610
    if(!sliced && offset_dst)
2611
        dst += src_x + src_y*dst_stride;
2612
    dst8+= src_x + src_y*src_stride;
2613
//    src += src_x + src_y*src_stride;
2614

    
2615
    ptmp= tmp + 3*tmp_step;
2616
    block[0]= ptmp;
2617
    ptmp+=tmp_step;
2618
    pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2619

    
2620
    if(same_block(lt, rt)){
2621
        block[1]= block[0];
2622
    }else{
2623
        block[1]= ptmp;
2624
        ptmp+=tmp_step;
2625
        pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2626
    }
2627

    
2628
    if(same_block(lt, lb)){
2629
        block[2]= block[0];
2630
    }else if(same_block(rt, lb)){
2631
        block[2]= block[1];
2632
    }else{
2633
        block[2]= ptmp;
2634
        ptmp+=tmp_step;
2635
        pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2636
    }
2637

    
2638
    if(same_block(lt, rb) ){
2639
        block[3]= block[0];
2640
    }else if(same_block(rt, rb)){
2641
        block[3]= block[1];
2642
    }else if(same_block(lb, rb)){
2643
        block[3]= block[2];
2644
    }else{
2645
        block[3]= ptmp;
2646
        pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2647
    }
2648
#if 0
2649
    for(y=0; y<b_h; y++){
2650
        for(x=0; x<b_w; x++){
2651
            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2652
            if(add) dst[x + y*dst_stride] += v;
2653
            else    dst[x + y*dst_stride] -= v;
2654
        }
2655
    }
2656
    for(y=0; y<b_h; y++){
2657
        uint8_t *obmc2= obmc + (obmc_stride>>1);
2658
        for(x=0; x<b_w; x++){
2659
            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2660
            if(add) dst[x + y*dst_stride] += v;
2661
            else    dst[x + y*dst_stride] -= v;
2662
        }
2663
    }
2664
    for(y=0; y<b_h; y++){
2665
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2666
        for(x=0; x<b_w; x++){
2667
            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2668
            if(add) dst[x + y*dst_stride] += v;
2669
            else    dst[x + y*dst_stride] -= v;
2670
        }
2671
    }
2672
    for(y=0; y<b_h; y++){
2673
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2674
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2675
        for(x=0; x<b_w; x++){
2676
            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2677
            if(add) dst[x + y*dst_stride] += v;
2678
            else    dst[x + y*dst_stride] -= v;
2679
        }
2680
    }
2681
#else
2682
    if(sliced){
2683
        START_TIMER
2684

    
2685
        s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2686
        STOP_TIMER("inner_add_yblock")
2687
    }else
2688
    for(y=0; y<b_h; y++){
2689
        //FIXME ugly missue of obmc_stride
2690
        const uint8_t *obmc1= obmc + y*obmc_stride;
2691
        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2692
        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2693
        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2694
        for(x=0; x<b_w; x++){
2695
            int v=   obmc1[x] * block[3][x + y*src_stride]
2696
                    +obmc2[x] * block[2][x + y*src_stride]
2697
                    +obmc3[x] * block[1][x + y*src_stride]
2698
                    +obmc4[x] * block[0][x + y*src_stride];
2699

    
2700
            v <<= 8 - LOG2_OBMC_MAX;
2701
            if(FRAC_BITS != 8){
2702
                v += 1<<(7 - FRAC_BITS);
2703
                v >>= 8 - FRAC_BITS;
2704
            }
2705
            if(add){
2706
                v += dst[x + y*dst_stride];
2707
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2708
                if(v&(~255)) v= ~(v>>31);
2709
                dst8[x + y*src_stride] = v;
2710
            }else{
2711
                dst[x + y*dst_stride] -= v;
2712
            }
2713
        }
2714
    }
2715
#endif
2716
}
2717

    
2718
static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2719
    Plane *p= &s->plane[plane_index];
2720
    const int mb_w= s->b_width  << s->block_max_depth;
2721
    const int mb_h= s->b_height << s->block_max_depth;
2722
    int x, y, mb_x;
2723
    int block_size = MB_SIZE >> s->block_max_depth;
2724
    int block_w    = plane_index ? block_size/2 : block_size;
2725
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2726
    int obmc_stride= plane_index ? block_size : 2*block_size;
2727
    int ref_stride= s->current_picture.linesize[plane_index];
2728
    uint8_t *dst8= s->current_picture.data[plane_index];
2729
    int w= p->width;
2730
    int h= p->height;
2731
    START_TIMER
2732

    
2733
    if(s->keyframe || (s->avctx->debug&512)){
2734
        if(mb_y==mb_h)
2735
            return;
2736

    
2737
        if(add){
2738
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2739
            {
2740
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2741
                DWTELEM * line = sb->line[y];
2742
                for(x=0; x<w; x++)
2743
                {
2744
//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2745
                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2746
                    v >>= FRAC_BITS;
2747
                    if(v&(~255)) v= ~(v>>31);
2748
                    dst8[x + y*ref_stride]= v;
2749
                }
2750
            }
2751
        }else{
2752
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2753
            {
2754
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2755
                DWTELEM * line = sb->line[y];
2756
                for(x=0; x<w; x++)
2757
                {
2758
                    line[x] -= 128 << FRAC_BITS;
2759
//                    buf[x + y*w]-= 128<<FRAC_BITS;
2760
                }
2761
            }
2762
        }
2763

    
2764
        return;
2765
    }
2766

    
2767
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2768
            START_TIMER
2769

    
2770
            add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2771
                       block_w*mb_x - block_w/2,
2772
                       block_w*mb_y - block_w/2,
2773
                       block_w, block_w,
2774
                       w, h,
2775
                       w, ref_stride, obmc_stride,
2776
                       mb_x - 1, mb_y - 1,
2777
                       add, 0, plane_index);
2778

    
2779
            STOP_TIMER("add_yblock")
2780
        }
2781

    
2782
    STOP_TIMER("predict_slice")
2783
}
2784

    
2785
static av_always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2786
    Plane *p= &s->plane[plane_index];
2787
    const int mb_w= s->b_width  << s->block_max_depth;
2788
    const int mb_h= s->b_height << s->block_max_depth;
2789
    int x, y, mb_x;
2790
    int block_size = MB_SIZE >> s->block_max_depth;
2791
    int block_w    = plane_index ? block_size/2 : block_size;
2792
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2793
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2794
    int ref_stride= s->current_picture.linesize[plane_index];
2795
    uint8_t *dst8= s->current_picture.data[plane_index];
2796
    int w= p->width;
2797
    int h= p->height;
2798
    START_TIMER
2799

    
2800
    if(s->keyframe || (s->avctx->debug&512)){
2801
        if(mb_y==mb_h)
2802
            return;
2803

    
2804
        if(add){
2805
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2806
                for(x=0; x<w; x++){
2807
                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2808
                    v >>= FRAC_BITS;
2809
                    if(v&(~255)) v= ~(v>>31);
2810
                    dst8[x + y*ref_stride]= v;
2811
                }
2812
            }
2813
        }else{
2814
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2815
                for(x=0; x<w; x++){
2816
                    buf[x + y*w]-= 128<<FRAC_BITS;
2817
                }
2818
            }
2819
        }
2820

    
2821
        return;
2822
    }
2823

    
2824
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2825
            START_TIMER
2826

    
2827
            add_yblock(s, 0, NULL, buf, dst8, obmc,
2828
                       block_w*mb_x - block_w/2,
2829
                       block_w*mb_y - block_w/2,
2830
                       block_w, block_w,
2831
                       w, h,
2832
                       w, ref_stride, obmc_stride,
2833
                       mb_x - 1, mb_y - 1,
2834
                       add, 1, plane_index);
2835

    
2836
            STOP_TIMER("add_yblock")
2837
        }
2838

    
2839
    STOP_TIMER("predict_slice")
2840
}
2841

    
2842
static av_always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2843
    const int mb_h= s->b_height << s->block_max_depth;
2844
    int mb_y;
2845
    for(mb_y=0; mb_y<=mb_h; mb_y++)
2846
        predict_slice(s, buf, plane_index, add, mb_y);
2847
}
2848

    
2849
static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2850
    int i, x2, y2;
2851
    Plane *p= &s->plane[plane_index];
2852
    const int block_size = MB_SIZE >> s->block_max_depth;
2853
    const int block_w    = plane_index ? block_size/2 : block_size;
2854
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2855
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2856
    const int ref_stride= s->current_picture.linesize[plane_index];
2857
    uint8_t *src= s-> input_picture.data[plane_index];
2858
    DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2859
    const int b_stride = s->b_width << s->block_max_depth;
2860
    const int w= p->width;
2861
    const int h= p->height;
2862
    int index= mb_x + mb_y*b_stride;
2863
    BlockNode *b= &s->block[index];
2864
    BlockNode backup= *b;
2865
    int ab=0;
2866
    int aa=0;
2867

    
2868
    b->type|= BLOCK_INTRA;
2869
    b->color[plane_index]= 0;
2870
    memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2871

    
2872
    for(i=0; i<4; i++){
2873
        int mb_x2= mb_x + (i &1) - 1;
2874
        int mb_y2= mb_y + (i>>1) - 1;
2875
        int x= block_w*mb_x2 + block_w/2;
2876
        int y= block_w*mb_y2 + block_w/2;
2877

    
2878
        add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2879
                    x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2880

    
2881
        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2882
            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2883
                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2884
                int obmc_v= obmc[index];
2885
                int d;
2886
                if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2887
                if(x<0) obmc_v += obmc[index + block_w];
2888
                if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2889
                if(x+block_w>w) obmc_v += obmc[index - block_w];
2890
                //FIXME precalc this or simplify it somehow else
2891

    
2892
                d = -dst[index] + (1<<(FRAC_BITS-1));
2893
                dst[index] = d;
2894
                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2895
                aa += obmc_v * obmc_v; //FIXME precalclate this
2896
            }
2897
        }
2898
    }
2899
    *b= backup;
2900

    
2901
    return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
2902
}
2903

    
2904
static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2905
    const int b_stride = s->b_width << s->block_max_depth;
2906
    const int b_height = s->b_height<< s->block_max_depth;
2907
    int index= x + y*b_stride;
2908
    const BlockNode *b     = &s->block[index];
2909
    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2910
    const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2911
    const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2912
    const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2913
    int dmx, dmy;
2914
//  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2915
//  int my_context= av_log2(2*FFABS(left->my - top->my));
2916

    
2917
    if(x<0 || x>=b_stride || y>=b_height)
2918
        return 0;
2919
/*
2920
1            0      0
2921
01X          1-2    1
2922
001XX        3-6    2-3
2923
0001XXX      7-14   4-7
2924
00001XXXX   15-30   8-15
2925
*/
2926
//FIXME try accurate rate
2927
//FIXME intra and inter predictors if surrounding blocks arent the same type
2928
    if(b->type & BLOCK_INTRA){
2929
        return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2930
                   + av_log2(2*FFABS(left->color[1] - b->color[1]))
2931
                   + av_log2(2*FFABS(left->color[2] - b->color[2])));
2932
    }else{
2933
        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2934
        dmx-= b->mx;
2935
        dmy-= b->my;
2936
        return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2937
                    + av_log2(2*FFABS(dmy))
2938
                    + av_log2(2*b->ref));
2939
    }
2940
}
2941

    
2942
static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2943
    Plane *p= &s->plane[plane_index];
2944
    const int block_size = MB_SIZE >> s->block_max_depth;
2945
    const int block_w    = plane_index ? block_size/2 : block_size;
2946
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2947
    const int ref_stride= s->current_picture.linesize[plane_index];
2948
    uint8_t *dst= s->current_picture.data[plane_index];
2949
    uint8_t *src= s->  input_picture.data[plane_index];
2950
    DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2951
    uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2952
    uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
2953
    const int b_stride = s->b_width << s->block_max_depth;
2954
    const int b_height = s->b_height<< s->block_max_depth;
2955
    const int w= p->width;
2956
    const int h= p->height;
2957
    int distortion;
2958
    int rate= 0;
2959
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2960
    int sx= block_w*mb_x - block_w/2;
2961
    int sy= block_w*mb_y - block_w/2;
2962
    int x0= FFMAX(0,-sx);
2963
    int y0= FFMAX(0,-sy);
2964
    int x1= FFMIN(block_w*2, w-sx);
2965
    int y1= FFMIN(block_w*2, h-sy);
2966
    int i,x,y;
2967

    
2968
    pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2969

    
2970
    for(y=y0; y<y1; y++){
2971
        const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2972
        const DWTELEM *pred1 = pred + y*obmc_stride;
2973
        uint8_t *cur1 = cur + y*ref_stride;
2974
        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2975
        for(x=x0; x<x1; x++){
2976
            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2977
            v = (v + pred1[x]) >> FRAC_BITS;
2978
            if(v&(~255)) v= ~(v>>31);
2979
            dst1[x] = v;
2980
        }
2981
    }
2982

    
2983
    /* copy the regions where obmc[] = (uint8_t)256 */
2984
    if(LOG2_OBMC_MAX == 8
2985
        && (mb_x == 0 || mb_x == b_stride-1)
2986
        && (mb_y == 0 || mb_y == b_height-1)){
2987
        if(mb_x == 0)
2988
            x1 = block_w;
2989
        else
2990
            x0 = block_w;
2991
        if(mb_y == 0)
2992
            y1 = block_w;
2993
        else
2994
            y0 = block_w;
2995
        for(y=y0; y<y1; y++)
2996
            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2997
    }
2998

    
2999
    if(block_w==16){
3000
        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
3001
        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
3002
        /* FIXME cmps overlap but don't cover the wavelet's whole support,
3003
         * so improving the score of one block is not strictly guaranteed to
3004
         * improve the score of the whole frame, so iterative motion est
3005
         * doesn't always converge. */
3006
        if(s->avctx->me_cmp == FF_CMP_W97)
3007
            distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3008
        else if(s->avctx->me_cmp == FF_CMP_W53)
3009
            distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3010
        else{
3011
            distortion = 0;
3012
            for(i=0; i<4; i++){
3013
                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3014
                distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3015
            }
3016
        }
3017
    }else{
3018
        assert(block_w==8);
3019
        distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3020
    }
3021

    
3022
    if(plane_index==0){
3023
        for(i=0; i<4; i++){
3024
/* ..RRr
3025
 * .RXx.
3026
 * rxx..
3027
 */
3028
            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3029
        }
3030
        if(mb_x == b_stride-2)
3031
            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3032
    }
3033
    return distortion + rate*penalty_factor;
3034
}
3035

    
3036
static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3037
    int i, y2;
3038
    Plane *p= &s->plane[plane_index];
3039
    const int block_size = MB_SIZE >> s->block_max_depth;
3040
    const int block_w    = plane_index ? block_size/2 : block_size;
3041
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3042
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3043
    const int ref_stride= s->current_picture.linesize[plane_index];
3044
    uint8_t *dst= s->current_picture.data[plane_index];
3045
    uint8_t *src= s-> input_picture.data[plane_index];
3046
    static const DWTELEM zero_dst[4096]; //FIXME
3047
    const int b_stride = s->b_width << s->block_max_depth;
3048
    const int w= p->width;
3049
    const int h= p->height;
3050
    int distortion= 0;
3051
    int rate= 0;
3052
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3053

    
3054
    for(i=0; i<9; i++){
3055
        int mb_x2= mb_x + (i%3) - 1;
3056
        int mb_y2= mb_y + (i/3) - 1;
3057
        int x= block_w*mb_x2 + block_w/2;
3058
        int y= block_w*mb_y2 + block_w/2;
3059

    
3060
        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
3061
                   x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3062

    
3063
        //FIXME find a cleaner/simpler way to skip the outside stuff
3064
        for(y2= y; y2<0; y2++)
3065
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3066
        for(y2= h; y2<y+block_w; y2++)
3067
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3068
        if(x<0){
3069
            for(y2= y; y2<y+block_w; y2++)
3070
                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3071
        }
3072
        if(x+block_w > w){
3073
            for(y2= y; y2<y+block_w; y2++)
3074
                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3075
        }
3076

    
3077
        assert(block_w== 8 || block_w==16);
3078
        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3079
    }
3080

    
3081
    if(plane_index==0){
3082
        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3083
        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3084

    
3085
/* ..RRRr
3086
 * .RXXx.
3087
 * .RXXx.
3088
 * rxxx.
3089
 */
3090
        if(merged)
3091
            rate = get_block_bits(s, mb_x, mb_y, 2);
3092
        for(i=merged?4:0; i<9; i++){
3093
            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3094
            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3095
        }
3096
    }
3097
    return distortion + rate*penalty_factor;
3098
}
3099

    
3100
static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3101
    const int b_stride= s->b_width << s->block_max_depth;
3102
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3103
    BlockNode backup= *block;
3104
    int rd, index, value;
3105

    
3106
    assert(mb_x>=0 && mb_y>=0);
3107
    assert(mb_x<b_stride);
3108

    
3109
    if(intra){
3110
        block->color[0] = p[0];
3111
        block->color[1] = p[1];
3112
        block->color[2] = p[2];
3113
        block->type |= BLOCK_INTRA;
3114
    }else{
3115
        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3116
        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3117
        if(s->me_cache[index] == value)
3118
            return 0;
3119
        s->me_cache[index]= value;
3120

    
3121
        block->mx= p[0];
3122
        block->my= p[1];
3123
        block->type &= ~BLOCK_INTRA;
3124
    }
3125

    
3126
    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3127

    
3128
//FIXME chroma
3129
    if(rd < *best_rd){
3130
        *best_rd= rd;
3131
        return 1;
3132
    }else{
3133
        *block= backup;
3134
        return 0;
3135
    }
3136
}
3137

    
3138
/* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3139
static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3140
    int p[2] = {p0, p1};
3141
    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3142
}
3143

    
3144
static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3145
    const int b_stride= s->b_width << s->block_max_depth;
3146
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3147
    BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3148
    int rd, index, value;
3149

    
3150
    assert(mb_x>=0 && mb_y>=0);
3151
    assert(mb_x<b_stride);
3152
    assert(((mb_x|mb_y)&1) == 0);
3153

    
3154
    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3155
    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3156
    if(s->me_cache[index] == value)
3157
        return 0;
3158
    s->me_cache[index]= value;
3159

    
3160
    block->mx= p0;
3161
    block->my= p1;
3162
    block->ref= ref;
3163
    block->type &= ~BLOCK_INTRA;
3164
    block[1]= block[b_stride]= block[b_stride+1]= *block;
3165

    
3166
    rd= get_4block_rd(s, mb_x, mb_y, 0);
3167

    
3168
//FIXME chroma
3169
    if(rd < *best_rd){
3170
        *best_rd= rd;
3171
        return 1;
3172
    }else{
3173
        block[0]= backup[0];
3174
        block[1]= backup[1];
3175
        block[b_stride]= backup[2];
3176
        block[b_stride+1]= backup[3];
3177
        return 0;
3178
    }
3179
}
3180

    
3181
static void iterative_me(SnowContext *s){
3182
    int pass, mb_x, mb_y;
3183
    const int b_width = s->b_width  << s->block_max_depth;
3184
    const int b_height= s->b_height << s->block_max_depth;
3185
    const int b_stride= b_width;
3186
    int color[3];
3187

    
3188
    {
3189
        RangeCoder r = s->c;
3190
        uint8_t state[sizeof(s->block_state)];
3191
        memcpy(state, s->block_state, sizeof(s->block_state));
3192
        for(mb_y= 0; mb_y<s->b_height; mb_y++)
3193
            for(mb_x= 0; mb_x<s->b_width; mb_x++)
3194
                encode_q_branch(s, 0, mb_x, mb_y);
3195
        s->c = r;
3196
        memcpy(s->block_state, state, sizeof(s->block_state));
3197
    }
3198

    
3199
    for(pass=0; pass<25; pass++){
3200
        int change= 0;
3201

    
3202
        for(mb_y= 0; mb_y<b_height; mb_y++){
3203
            for(mb_x= 0; mb_x<b_width; mb_x++){
3204
                int dia_change, i, j, ref;
3205
                int best_rd= INT_MAX, ref_rd;
3206
                BlockNode backup, ref_b;
3207
                const int index= mb_x + mb_y * b_stride;
3208
                BlockNode *block= &s->block[index];
3209
                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3210
                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3211
                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3212
                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3213
                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3214
                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3215
                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3216
                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3217
                const int b_w= (MB_SIZE >> s->block_max_depth);
3218
                uint8_t obmc_edged[b_w*2][b_w*2];
3219

    
3220
                if(pass && (block->type & BLOCK_OPT))
3221
                    continue;
3222
                block->type |= BLOCK_OPT;
3223

    
3224
                backup= *block;
3225

    
3226
                if(!s->me_cache_generation)
3227
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3228
                s->me_cache_generation += 1<<22;
3229

    
3230
                //FIXME precalc
3231
                {
3232
                    int x, y;
3233
                    memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3234
                    if(mb_x==0)
3235
                        for(y=0; y<b_w*2; y++)
3236
                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3237
                    if(mb_x==b_stride-1)
3238
                        for(y=0; y<b_w*2; y++)
3239
                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3240
                    if(mb_y==0){
3241
                        for(x=0; x<b_w*2; x++)
3242
                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
3243
                        for(y=1; y<b_w; y++)
3244
                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3245
                    }
3246
                    if(mb_y==b_height-1){
3247
                        for(x=0; x<b_w*2; x++)
3248
                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3249
                        for(y=b_w; y<b_w*2-1; y++)
3250
                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3251
                    }
3252
                }
3253

    
3254
                //skip stuff outside the picture
3255
                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3256
                {
3257
                    uint8_t *src= s->  input_picture.data[0];
3258
                    uint8_t *dst= s->current_picture.data[0];
3259
                    const int stride= s->current_picture.linesize[0];
3260
                    const int block_w= MB_SIZE >> s->block_max_depth;
3261
                    const int sx= block_w*mb_x - block_w/2;
3262
                    const int sy= block_w*mb_y - block_w/2;
3263
                    const int w= s->plane[0].width;
3264
                    const int h= s->plane[0].height;
3265
                    int y;
3266

    
3267
                    for(y=sy; y<0; y++)
3268
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3269
                    for(y=h; y<sy+block_w*2; y++)
3270
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3271
                    if(sx<0){
3272
                        for(y=sy; y<sy+block_w*2; y++)
3273
                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3274
                    }
3275
                    if(sx+block_w*2 > w){
3276
                        for(y=sy; y<sy+block_w*2; y++)
3277
                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3278
                    }
3279
                }
3280

    
3281
                // intra(black) = neighbors' contribution to the current block
3282
                for(i=0; i<3; i++)
3283
                    color[i]= get_dc(s, mb_x, mb_y, i);
3284

    
3285
                // get previous score (cant be cached due to OBMC)
3286
                if(pass > 0 && (block->type&BLOCK_INTRA)){
3287
                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
3288
                    check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3289
                }else
3290
                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3291

    
3292
                ref_b= *block;
3293
                ref_rd= best_rd;
3294
                for(ref=0; ref < s->ref_frames; ref++){
3295
                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3296
                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3297
                        continue;
3298
                    block->ref= ref;
3299
                    best_rd= INT_MAX;
3300

    
3301
                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3302
                    check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3303
                    if(tb)
3304
                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3305
                    if(lb)
3306
                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3307
                    if(rb)
3308
                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3309
                    if(bb)
3310
                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3311

    
3312
                    /* fullpel ME */
3313
                    //FIXME avoid subpel interpol / round to nearest integer
3314
                    do{
3315
                        dia_change=0;
3316
                        for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3317
                            for(j=0; j<i; j++){
3318
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3319
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3320
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3321
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3322
                            }
3323
                        }
3324
                    }while(dia_change);
3325
                    /* subpel ME */
3326
                    do{
3327
                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3328
                        dia_change=0;
3329
                        for(i=0; i<8; i++)
3330
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3331
                    }while(dia_change);
3332
                    //FIXME or try the standard 2 pass qpel or similar
3333

    
3334
                    mvr[0][0]= block->mx;
3335
                    mvr[0][1]= block->my;
3336
                    if(ref_rd > best_rd){
3337
                        ref_rd= best_rd;
3338
                        ref_b= *block;
3339
                    }
3340
                }
3341
                best_rd= ref_rd;
3342
                *block= ref_b;
3343
#if 1
3344
                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3345
                //FIXME RD style color selection
3346
#endif
3347
                if(!same_block(block, &backup)){
3348
                    if(tb ) tb ->type &= ~BLOCK_OPT;
3349
                    if(lb ) lb ->type &= ~BLOCK_OPT;
3350
                    if(rb ) rb ->type &= ~BLOCK_OPT;
3351
                    if(bb ) bb ->type &= ~BLOCK_OPT;
3352
                    if(tlb) tlb->type &= ~BLOCK_OPT;
3353
                    if(trb) trb->type &= ~BLOCK_OPT;
3354
                    if(blb) blb->type &= ~BLOCK_OPT;
3355
                    if(brb) brb->type &= ~BLOCK_OPT;
3356
                    change ++;
3357
                }
3358
            }
3359
        }
3360
        av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3361
        if(!change)
3362
            break;
3363
    }
3364

    
3365
    if(s->block_max_depth == 1){
3366
        int change= 0;
3367
        for(mb_y= 0; mb_y<b_height; mb_y+=2){
3368
            for(mb_x= 0; mb_x<b_width; mb_x+=2){
3369
                int i;
3370
                int best_rd, init_rd;
3371
                const int index= mb_x + mb_y * b_stride;
3372
                BlockNode *b[4];
3373

    
3374
                b[0]= &s->block[index];
3375
                b[1]= b[0]+1;
3376
                b[2]= b[0]+b_stride;
3377
                b[3]= b[2]+1;
3378
                if(same_block(b[0], b[1]) &&
3379
                   same_block(b[0], b[2]) &&
3380
                   same_block(b[0], b[3]))
3381
                    continue;
3382

    
3383
                if(!s->me_cache_generation)
3384
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3385
                s->me_cache_generation += 1<<22;
3386

    
3387
                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3388

    
3389
                //FIXME more multiref search?
3390
                check_4block_inter(s, mb_x, mb_y,
3391
                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3392
                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3393

    
3394
                for(i=0; i<4; i++)
3395
                    if(!(b[i]->type&BLOCK_INTRA))
3396
                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3397

    
3398
                if(init_rd != best_rd)
3399
                    change++;
3400
            }
3401
        }
3402
        av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3403
    }
3404
}
3405

    
3406
static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3407
    const int level= b->level;
3408
    const int w= b->width;
3409
    const int h= b->height;
3410
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3411
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3412
    int x,y, thres1, thres2;
3413
//    START_TIMER
3414

    
3415
    if(s->qlog == LOSSLESS_QLOG) return;
3416

    
3417
    bias= bias ? 0 : (3*qmul)>>3;
3418
    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3419
    thres2= 2*thres1;
3420

    
3421
    if(!bias){
3422
        for(y=0; y<h; y++){
3423
            for(x=0; x<w; x++){
3424
                int i= src[x + y*stride];
3425

    
3426
                if((unsigned)(i+thres1) > thres2){
3427
                    if(i>=0){
3428
                        i<<= QEXPSHIFT;
3429
                        i/= qmul; //FIXME optimize
3430
                        src[x + y*stride]=  i;
3431
                    }else{
3432
                        i= -i;
3433
                        i<<= QEXPSHIFT;
3434
                        i/= qmul; //FIXME optimize
3435
                        src[x + y*stride]= -i;
3436
                    }
3437
                }else
3438
                    src[x + y*stride]= 0;
3439
            }
3440
        }
3441
    }else{
3442
        for(y=0; y<h; y++){
3443
            for(x=0; x<w; x++){
3444
                int i= src[x + y*stride];
3445

    
3446
                if((unsigned)(i+thres1) > thres2){
3447
                    if(i>=0){
3448
                        i<<= QEXPSHIFT;
3449
                        i= (i + bias) / qmul; //FIXME optimize
3450
                        src[x + y*stride]=  i;
3451
                    }else{
3452
                        i= -i;
3453
                        i<<= QEXPSHIFT;
3454
                        i= (i + bias) / qmul; //FIXME optimize
3455
                        src[x + y*stride]= -i;
3456
                    }
3457
                }else
3458
                    src[x + y*stride]= 0;
3459
            }
3460
        }
3461
    }
3462
    if(level+1 == s->spatial_decomposition_count){
3463
//        STOP_TIMER("quantize")
3464
    }
3465
}
3466

    
3467
static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3468
    const int w= b->width;
3469
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3470
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3471
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3472
    int x,y;
3473
    START_TIMER
3474

    
3475
    if(s->qlog == LOSSLESS_QLOG) return;
3476

    
3477
    for(y=start_y; y<end_y; y++){
3478
//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3479
        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3480
        for(x=0; x<w; x++){
3481
            int i= line[x];
3482
            if(i<0){
3483
                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3484
            }else if(i>0){
3485
                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3486
            }
3487
        }
3488
    }
3489
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3490
        STOP_TIMER("dquant")
3491
    }
3492
}
3493

    
3494
static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3495
    const int w= b->width;
3496
    const int h= b->height;
3497
    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3498
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3499
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3500
    int x,y;
3501
    START_TIMER
3502

    
3503
    if(s->qlog == LOSSLESS_QLOG) return;
3504

    
3505
    for(y=0; y<h; y++){
3506
        for(x=0; x<w; x++){
3507
            int i= src[x + y*stride];
3508
            if(i<0){
3509
                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3510
            }else if(i>0){
3511
                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3512
            }
3513
        }
3514
    }
3515
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3516
        STOP_TIMER("dquant")
3517
    }
3518
}
3519

    
3520
static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3521
    const int w= b->width;
3522
    const int h= b->height;
3523
    int x,y;
3524

    
3525
    for(y=h-1; y>=0; y--){
3526
        for(x=w-1; x>=0; x--){
3527
            int i= x + y*stride;
3528

    
3529
            if(x){
3530
                if(use_median){
3531
                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3532
                    else  src[i] -= src[i - 1];
3533
                }else{
3534
                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3535
                    else  src[i] -= src[i - 1];
3536
                }
3537
            }else{
3538
                if(y) src[i] -= src[i - stride];
3539
            }
3540
        }
3541
    }
3542
}
3543

    
3544
static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3545
    const int w= b->width;
3546
    int x,y;
3547

    
3548
//    START_TIMER
3549

    
3550
    DWTELEM * line=0; // silence silly "could be used without having been initialized" warning
3551
    DWTELEM * prev;
3552

    
3553
    if (start_y != 0)
3554
        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3555

    
3556
    for(y=start_y; y<end_y; y++){
3557
        prev = line;
3558
//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3559
        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3560
        for(x=0; x<w; x++){
3561
            if(x){
3562
                if(use_median){
3563
                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3564
                    else  line[x] += line[x - 1];
3565
                }else{
3566
                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3567
                    else  line[x] += line[x - 1];
3568
                }
3569
            }else{
3570
                if(y) line[x] += prev[x];
3571
            }
3572
        }
3573
    }
3574

    
3575
//    STOP_TIMER("correlate")
3576
}
3577

    
3578
static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3579
    const int w= b->width;
3580
    const int h= b->height;
3581
    int x,y;
3582

    
3583
    for(y=0; y<h; y++){
3584
        for(x=0; x<w; x++){
3585
            int i= x + y*stride;
3586

    
3587
            if(x){
3588
                if(use_median){
3589
                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3590
                    else  src[i] += src[i - 1];
3591
                }else{
3592
                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3593
                    else  src[i] += src[i - 1];
3594
                }
3595
            }else{
3596
                if(y) src[i] += src[i - stride];
3597
            }
3598
        }
3599
    }
3600
}
3601

    
3602
static void encode_header(SnowContext *s){
3603
    int plane_index, level, orientation;
3604
    uint8_t kstate[32];
3605

    
3606
    memset(kstate, MID_STATE, sizeof(kstate));
3607

    
3608
    put_rac(&s->c, kstate, s->keyframe);
3609
    if(s->keyframe || s->always_reset){
3610
        reset_contexts(s);
3611
        s->last_spatial_decomposition_type=
3612
        s->last_qlog=
3613
        s->last_qbias=
3614
        s->last_mv_scale=
3615
        s->last_block_max_depth= 0;
3616
    }
3617
    if(s->keyframe){
3618
        put_symbol(&s->c, s->header_state, s->version, 0);
3619
        put_rac(&s->c, s->header_state, s->always_reset);
3620
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3621
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3622
        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3623
        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3624
        put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3625
        put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3626
        put_rac(&s->c, s->header_state, s->spatial_scalability);
3627
//        put_rac(&s->c, s->header_state, s->rate_scalability);
3628
        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3629

    
3630
        for(plane_index=0; plane_index<2; plane_index++){
3631
            for(level=0; level<s->spatial_decomposition_count; level++){
3632
                for(orientation=level ? 1:0; orientation<4; orientation++){
3633
                    if(orientation==2) continue;
3634
                    put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3635
                }
3636
            }
3637
        }
3638
    }
3639
    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3640
    put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3641
    put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3642
    put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3643
    put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3644

    
3645
    s->last_spatial_decomposition_type= s->spatial_decomposition_type;
3646
    s->last_qlog                      = s->qlog;
3647
    s->last_qbias                     = s->qbias;
3648
    s->last_mv_scale                  = s->mv_scale;
3649
    s->last_block_max_depth           = s->block_max_depth;
3650
}
3651

    
3652
static int decode_header(SnowContext *s){
3653
    int plane_index, level, orientation;
3654
    uint8_t kstate[32];
3655

    
3656
    memset(kstate, MID_STATE, sizeof(kstate));
3657

    
3658
    s->keyframe= get_rac(&s->c, kstate);
3659
    if(s->keyframe || s->always_reset){
3660
        reset_contexts(s);
3661
        s->spatial_decomposition_type=
3662
        s->qlog=
3663
        s->qbias=
3664
        s->mv_scale=
3665
        s->block_max_depth= 0;
3666
    }
3667
    if(s->keyframe){
3668
        s->version= get_symbol(&s->c, s->header_state, 0);
3669
        if(s->version>0){
3670
            av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3671
            return -1;
3672
        }
3673
        s->always_reset= get_rac(&s->c, s->header_state);
3674
        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3675
        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3676
        s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3677
        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3678
        s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3679
        s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3680
        s->spatial_scalability= get_rac(&s->c, s->header_state);
3681
//        s->rate_scalability= get_rac(&s->c, s->header_state);
3682
        s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3683

    
3684
        for(plane_index=0; plane_index<3; plane_index++){
3685
            for(level=0; level<s->spatial_decomposition_count; level++){
3686
                for(orientation=level ? 1:0; orientation<4; orientation++){
3687
                    int q;
3688
                    if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3689
                    else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3690
                    else                    q= get_symbol(&s->c, s->header_state, 1);
3691
                    s->plane[plane_index].band[level][orientation].qlog= q;
3692
                }
3693
            }
3694
        }
3695
    }
3696

    
3697
    s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3698
    if(s->spatial_decomposition_type > 2){
3699
        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3700
        return -1;
3701
    }
3702

    
3703
    s->qlog           += get_symbol(&s->c, s->header_state, 1);
3704
    s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3705
    s->qbias          += get_symbol(&s->c, s->header_state, 1);
3706
    s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3707
    if(s->block_max_depth > 1 || s->block_max_depth < 0){
3708
        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3709
        s->block_max_depth= 0;
3710
        return -1;
3711
    }
3712

    
3713
    return 0;
3714
}
3715

    
3716
static void init_qexp(void){
3717
    int i;
3718
    double v=128;
3719

    
3720
    for(i=0; i<QROOT; i++){
3721
        qexp[i]= lrintf(v);
3722
        v *= pow(2, 1.0 / QROOT);
3723
    }
3724
}
3725

    
3726
static int common_init(AVCodecContext *avctx){
3727
    SnowContext *s = avctx->priv_data;
3728
    int width, height;
3729
    int level, orientation, plane_index, dec;
3730
    int i, j;
3731

    
3732
    s->avctx= avctx;
3733

    
3734
    dsputil_init(&s->dsp, avctx);
3735

    
3736
#define mcf(dx,dy)\
3737
    s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3738
    s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3739
        s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3740
    s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3741
    s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3742
        s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3743

    
3744
    mcf( 0, 0)
3745
    mcf( 4, 0)
3746
    mcf( 8, 0)
3747
    mcf(12, 0)
3748
    mcf( 0, 4)
3749
    mcf( 4, 4)
3750
    mcf( 8, 4)
3751
    mcf(12, 4)
3752
    mcf( 0, 8)
3753
    mcf( 4, 8)
3754
    mcf( 8, 8)
3755
    mcf(12, 8)
3756
    mcf( 0,12)
3757
    mcf( 4,12)
3758
    mcf( 8,12)
3759
    mcf(12,12)
3760

    
3761
#define mcfh(dx,dy)\
3762
    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3763
    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3764
        mc_block_hpel ## dx ## dy ## 16;\
3765
    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3766
    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3767
        mc_block_hpel ## dx ## dy ## 8;
3768

    
3769
    mcfh(0, 0)
3770
    mcfh(8, 0)
3771
    mcfh(0, 8)
3772
    mcfh(8, 8)
3773

    
3774
    if(!qexp[0])
3775
        init_qexp();
3776

    
3777
    dec= s->spatial_decomposition_count= 5;
3778
    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3779

    
3780
    s->chroma_h_shift= 1; //FIXME XXX
3781
    s->chroma_v_shift= 1;
3782

    
3783
//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3784

    
3785
    width= s->avctx->width;
3786
    height= s->avctx->height;
3787

    
3788
    s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3789

    
3790
    s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3791
    s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3792

    
3793
    for(plane_index=0; plane_index<3; plane_index++){
3794
        int w= s->avctx->width;
3795
        int h= s->avctx->height;
3796

    
3797
        if(plane_index){
3798
            w>>= s->chroma_h_shift;
3799
            h>>= s->chroma_v_shift;
3800
        }
3801
        s->plane[plane_index].width = w;
3802
        s->plane[plane_index].height= h;
3803
//av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3804
        for(level=s->spatial_decomposition_count-1; level>=0; level--){
3805
            for(orientation=level ? 1 : 0; orientation<4; orientation++){
3806
                SubBand *b= &s->plane[plane_index].band[level][orientation];
3807

    
3808
                b->buf= s->spatial_dwt_buffer;
3809
                b->level= level;
3810
                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3811
                b->width = (w + !(orientation&1))>>1;
3812
                b->height= (h + !(orientation>1))>>1;
3813

    
3814
                b->stride_line = 1 << (s->spatial_decomposition_count - level);
3815
                b->buf_x_offset = 0;
3816
                b->buf_y_offset = 0;
3817

    
3818
                if(orientation&1){
3819
                    b->buf += (w+1)>>1;
3820
                    b->buf_x_offset = (w+1)>>1;
3821
                }
3822
                if(orientation>1){
3823
                    b->buf += b->stride>>1;
3824
                    b->buf_y_offset = b->stride_line >> 1;
3825
                }
3826

    
3827
                if(level)
3828
                    b->parent= &s->plane[plane_index].band[level-1][orientation];
3829
                b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3830
            }
3831
            w= (w+1)>>1;
3832
            h= (h+1)>>1;
3833
        }
3834
    }
3835

    
3836
    for(i=0; i<MAX_REF_FRAMES; i++)
3837
        for(j=0; j<MAX_REF_FRAMES; j++)
3838
            scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3839

    
3840
    reset_contexts(s);
3841
/*
3842
    width= s->width= avctx->width;
3843
    height= s->height= avctx->height;
3844

3845
    assert(width && height);
3846
*/
3847
    s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3848

    
3849
    return 0;
3850
}
3851

    
3852
static int qscale2qlog(int qscale){
3853
    return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3854
           + 61*QROOT/8; //<64 >60
3855
}
3856

    
3857
static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3858
{
3859
    /* estimate the frame's complexity as a sum of weighted dwt coefs.
3860
     * FIXME we know exact mv bits at this point,
3861
     * but ratecontrol isn't set up to include them. */
3862
    uint32_t coef_sum= 0;
3863
    int level, orientation, delta_qlog;
3864

    
3865
    for(level=0; level<s->spatial_decomposition_count; level++){
3866
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3867
            SubBand *b= &s->plane[0].band[level][orientation];
3868
            DWTELEM *buf= b->buf;
3869
            const int w= b->width;
3870
            const int h= b->height;
3871
            const int stride= b->stride;
3872
            const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3873
            const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3874
            const int qdiv= (1<<16)/qmul;
3875
            int x, y;
3876
            if(orientation==0)
3877
                decorrelate(s, b, buf, stride, 1, 0);
3878
            for(y=0; y<h; y++)
3879
                for(x=0; x<w; x++)
3880
                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3881
            if(orientation==0)
3882
                correlate(s, b, buf, stride, 1, 0);
3883
        }
3884
    }
3885

    
3886
    /* ugly, ratecontrol just takes a sqrt again */
3887
    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3888
    assert(coef_sum < INT_MAX);
3889

    
3890
    if(pict->pict_type == I_TYPE){
3891
        s->m.current_picture.mb_var_sum= coef_sum;
3892
        s->m.current_picture.mc_mb_var_sum= 0;
3893
    }else{
3894
        s->m.current_picture.mc_mb_var_sum= coef_sum;
3895
        s->m.current_picture.mb_var_sum= 0;
3896
    }
3897

    
3898
    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3899
    if (pict->quality < 0)
3900
        return INT_MIN;
3901
    s->lambda= pict->quality * 3/2;
3902
    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3903
    s->qlog+= delta_qlog;
3904
    return delta_qlog;
3905
}
3906

    
3907
static void calculate_vissual_weight(SnowContext *s, Plane *p){
3908
    int width = p->width;
3909
    int height= p->height;
3910
    int level, orientation, x, y;
3911

    
3912
    for(level=0; level<s->spatial_decomposition_count; level++){
3913
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3914
            SubBand *b= &p->band[level][orientation];
3915
            DWTELEM *buf= b->buf;
3916
            int64_t error=0;
3917

    
3918
            memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3919
            buf[b->width/2 + b->height/2*b->stride]= 256*256;
3920
            ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3921
            for(y=0; y<height; y++){
3922
                for(x=0; x<width; x++){
3923
                    int64_t d= s->spatial_dwt_buffer[x + y*width];
3924
                    error += d*d;
3925
                }
3926
            }
3927

    
3928
            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3929
//            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3930
        }
3931
    }
3932
}
3933

    
3934
static int encode_init(AVCodecContext *avctx)
3935
{
3936
    SnowContext *s = avctx->priv_data;
3937
    int plane_index;
3938

    
3939
    if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3940
        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3941
               "use vstrict=-2 / -strict -2 to use it anyway\n");
3942
        return -1;
3943
    }
3944

    
3945
    if(avctx->prediction_method == DWT_97
3946
       && (avctx->flags & CODEC_FLAG_QSCALE)
3947
       && avctx->global_quality == 0){
3948
        av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
3949
        return -1;
3950
    }
3951

    
3952
    common_init(avctx);
3953
    alloc_blocks(s);
3954

    
3955
    s->version=0;
3956

    
3957
    s->m.avctx   = avctx;
3958
    s->m.flags   = avctx->flags;
3959
    s->m.bit_rate= avctx->bit_rate;
3960

    
3961
    s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3962
    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3963
    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3964
    s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3965
    h263_encode_init(&s->m); //mv_penalty
3966

    
3967
    s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
3968

    
3969
    if(avctx->flags&CODEC_FLAG_PASS1){
3970
        if(!avctx->stats_out)
3971
            avctx->stats_out = av_mallocz(256);
3972
    }
3973
    if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
3974
        if(ff_rate_control_init(&s->m) < 0)
3975
            return -1;
3976
    }
3977
    s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
3978

    
3979
    for(plane_index=0; plane_index<3; plane_index++){
3980
        calculate_vissual_weight(s, &s->plane[plane_index]);
3981
    }
3982

    
3983

    
3984
    avctx->coded_frame= &s->current_picture;
3985
    switch(avctx->pix_fmt){
3986
//    case PIX_FMT_YUV444P:
3987
//    case PIX_FMT_YUV422P:
3988
    case PIX_FMT_YUV420P:
3989
    case PIX_FMT_GRAY8:
3990
//    case PIX_FMT_YUV411P:
3991
//    case PIX_FMT_YUV410P:
3992
        s->colorspace_type= 0;
3993
        break;
3994
/*    case PIX_FMT_RGB32:
3995
        s->colorspace= 1;
3996
        break;*/
3997
    default:
3998
        av_log(avctx, AV_LOG_ERROR, "format not supported\n");
3999
        return -1;
4000
    }
4001
//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
4002
    s->chroma_h_shift= 1;
4003
    s->chroma_v_shift= 1;
4004

    
4005
    ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4006
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4007

    
4008
    s->avctx->get_buffer(s->avctx, &s->input_picture);
4009

    
4010
    if(s->avctx->me_method == ME_ITER){
4011
        int i;
4012
        int size= s->b_width * s->b_height << 2*s->block_max_depth;
4013
        for(i=0; i<s->max_ref_frames; i++){
4014
            s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4015
            s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4016
        }
4017
    }
4018

    
4019
    return 0;
4020
}
4021

    
4022
static int frame_start(SnowContext *s){
4023
   AVFrame tmp;
4024
   int w= s->avctx->width; //FIXME round up to x16 ?
4025
   int h= s->avctx->height;
4026

    
4027
    if(s->current_picture.data[0]){
4028
        draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4029
        draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4030
        draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4031
    }
4032

    
4033
    tmp= s->last_picture[s->max_ref_frames-1];
4034
    memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4035
    s->last_picture[0]= s->current_picture;
4036
    s->current_picture= tmp;
4037

    
4038
    if(s->keyframe){
4039
        s->ref_frames= 0;
4040
    }else{
4041
        int i;
4042
        for(i=0; i<s->max_ref_frames && s->last_picture[i].data[