Statistics
| Branch: | Revision:

ffmpeg / libavcodec / snow.c @ 396a5e68

History | View | Annotate | Download (167 KB)

1
/*
2
 * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "avcodec.h"
22
#include "common.h"
23
#include "dsputil.h"
24
#include "snow.h"
25

    
26
#include "rangecoder.h"
27

    
28
#include "mpegvideo.h"
29

    
30
#undef NDEBUG
31
#include <assert.h>
32

    
33
static const int8_t quant3[256]={
34
 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
35
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
36
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
43
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
44
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
45
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
46
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
47
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
48
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
49
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
50
};
51
static const int8_t quant3b[256]={
52
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
57
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
59
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
60
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
61
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
62
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
63
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
64
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
65
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
66
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
67
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
68
};
69
static const int8_t quant3bA[256]={
70
 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
71
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
72
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
73
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
74
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
75
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
76
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
77
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
78
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
79
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
80
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
81
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
82
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
83
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
84
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
85
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
86
};
87
static const int8_t quant5[256]={
88
 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
91
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
92
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
93
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
94
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
95
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
96
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
97
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
98
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
99
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
100
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
101
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
102
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
103
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
104
};
105
static const int8_t quant7[256]={
106
 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
107
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
108
 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
109
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
110
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
111
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
112
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
113
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
114
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
115
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
116
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
117
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
118
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
119
-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
120
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
121
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
122
};
123
static const int8_t quant9[256]={
124
 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
125
 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
126
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
127
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
128
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
129
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
130
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
131
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
132
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
133
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
134
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
135
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
136
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
137
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
138
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
139
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
140
};
141
static const int8_t quant11[256]={
142
 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
143
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
144
 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
145
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
146
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
147
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
148
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
149
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
150
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
151
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
152
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
153
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
154
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
155
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
156
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
157
-4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
158
};
159
static const int8_t quant13[256]={
160
 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
161
 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
162
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
163
 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
164
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
165
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
166
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
167
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
168
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
169
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
170
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
171
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
172
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
173
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
174
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
175
-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
176
};
177

    
178
#if 0 //64*cubic
179
static const uint8_t obmc32[1024]={
180
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
182
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
183
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
184
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
185
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
186
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
187
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
188
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
189
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
190
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
191
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
192
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
193
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
194
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
195
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
196
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
197
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
198
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
199
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
200
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
201
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
202
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
203
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
204
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
205
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
206
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
207
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
208
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
209
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
210
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
211
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212
//error:0.000022
213
};
214
static const uint8_t obmc16[256]={
215
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
216
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
217
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
218
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
219
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
220
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
221
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
222
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
223
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
224
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
225
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
226
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
227
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
228
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
229
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
230
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
231
//error:0.000033
232
};
233
#elif 1 // 64*linear
234
static const uint8_t obmc32[1024]={
235
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
236
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
237
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
238
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
239
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
240
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
241
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
242
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
243
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
244
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
245
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
246
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
247
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
248
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
249
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
250
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
251
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
252
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
253
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
254
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
255
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
256
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
257
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
258
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
259
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
260
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
261
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
262
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
263
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
264
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
265
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
266
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
267
 //error:0.000020
268
};
269
static const uint8_t obmc16[256]={
270
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
271
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
272
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
273
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
274
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
275
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
276
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
277
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
278
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
279
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
280
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
281
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
282
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
283
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
284
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
285
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
286
//error:0.000015
287
};
288
#else //64*cos
289
static const uint8_t obmc32[1024]={
290
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
291
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
292
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
293
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
294
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
295
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
296
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
297
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
298
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
299
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
300
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
301
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
302
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
303
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
304
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
305
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
306
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
307
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
308
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
309
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
310
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
311
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
312
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
313
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
314
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
315
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
316
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
317
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
318
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
319
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
320
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
321
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
322
//error:0.000022
323
};
324
static const uint8_t obmc16[256]={
325
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
326
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
327
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
328
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
329
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
330
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
331
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
332
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
333
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
334
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
335
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
336
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
337
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
338
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
339
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
340
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
341
//error:0.000022
342
};
343
#endif
344

    
345
//linear *64
346
static const uint8_t obmc8[64]={
347
  4, 12, 20, 28, 28, 20, 12,  4,
348
 12, 36, 60, 84, 84, 60, 36, 12,
349
 20, 60,100,140,140,100, 60, 20,
350
 28, 84,140,196,196,140, 84, 28,
351
 28, 84,140,196,196,140, 84, 28,
352
 20, 60,100,140,140,100, 60, 20,
353
 12, 36, 60, 84, 84, 60, 36, 12,
354
  4, 12, 20, 28, 28, 20, 12,  4,
355
//error:0.000000
356
};
357

    
358
//linear *64
359
static const uint8_t obmc4[16]={
360
 16, 48, 48, 16,
361
 48,144,144, 48,
362
 48,144,144, 48,
363
 16, 48, 48, 16,
364
//error:0.000000
365
};
366

    
367
static const uint8_t *obmc_tab[4]={
368
    obmc32, obmc16, obmc8, obmc4
369
};
370

    
371
static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
372

    
373
typedef struct BlockNode{
374
    int16_t mx;
375
    int16_t my;
376
    uint8_t ref;
377
    uint8_t color[3];
378
    uint8_t type;
379
//#define TYPE_SPLIT    1
380
#define BLOCK_INTRA   1
381
#define BLOCK_OPT     2
382
//#define TYPE_NOCOLOR  4
383
    uint8_t level; //FIXME merge into type?
384
}BlockNode;
385

    
386
static const BlockNode null_block= { //FIXME add border maybe
387
    .color= {128,128,128},
388
    .mx= 0,
389
    .my= 0,
390
    .ref= 0,
391
    .type= 0,
392
    .level= 0,
393
};
394

    
395
#define LOG2_MB_SIZE 4
396
#define MB_SIZE (1<<LOG2_MB_SIZE)
397

    
398
typedef struct x_and_coeff{
399
    int16_t x;
400
    uint16_t coeff;
401
} x_and_coeff;
402

    
403
typedef struct SubBand{
404
    int level;
405
    int stride;
406
    int width;
407
    int height;
408
    int qlog;                                   ///< log(qscale)/log[2^(1/6)]
409
    DWTELEM *buf;
410
    int buf_x_offset;
411
    int buf_y_offset;
412
    int stride_line; ///< Stride measured in lines, not pixels.
413
    x_and_coeff * x_coeff;
414
    struct SubBand *parent;
415
    uint8_t state[/*7*2*/ 7 + 512][32];
416
}SubBand;
417

    
418
typedef struct Plane{
419
    int width;
420
    int height;
421
    SubBand band[MAX_DECOMPOSITIONS][4];
422
}Plane;
423

    
424
typedef struct SnowContext{
425
//    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
426

    
427
    AVCodecContext *avctx;
428
    RangeCoder c;
429
    DSPContext dsp;
430
    AVFrame new_picture;
431
    AVFrame input_picture;              ///< new_picture with the internal linesizes
432
    AVFrame current_picture;
433
    AVFrame last_picture[MAX_REF_FRAMES];
434
    AVFrame mconly_picture;
435
//     uint8_t q_context[16];
436
    uint8_t header_state[32];
437
    uint8_t block_state[128 + 32*128];
438
    int keyframe;
439
    int always_reset;
440
    int version;
441
    int spatial_decomposition_type;
442
    int last_spatial_decomposition_type;
443
    int temporal_decomposition_type;
444
    int spatial_decomposition_count;
445
    int temporal_decomposition_count;
446
    int max_ref_frames;
447
    int ref_frames;
448
    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
449
    uint32_t *ref_scores[MAX_REF_FRAMES];
450
    DWTELEM *spatial_dwt_buffer;
451
    int colorspace_type;
452
    int chroma_h_shift;
453
    int chroma_v_shift;
454
    int spatial_scalability;
455
    int qlog;
456
    int last_qlog;
457
    int lambda;
458
    int lambda2;
459
    int pass1_rc;
460
    int mv_scale;
461
    int last_mv_scale;
462
    int qbias;
463
    int last_qbias;
464
#define QBIAS_SHIFT 3
465
    int b_width;
466
    int b_height;
467
    int block_max_depth;
468
    int last_block_max_depth;
469
    Plane plane[MAX_PLANES];
470
    BlockNode *block;
471
#define ME_CACHE_SIZE 1024
472
    int me_cache[ME_CACHE_SIZE];
473
    int me_cache_generation;
474
    slice_buffer sb;
475

    
476
    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
477
}SnowContext;
478

    
479
typedef struct {
480
    DWTELEM *b0;
481
    DWTELEM *b1;
482
    DWTELEM *b2;
483
    DWTELEM *b3;
484
    int y;
485
} dwt_compose_t;
486

    
487
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
488
//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
489

    
490
static void iterative_me(SnowContext *s);
491

    
492
static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
493
{
494
    int i;
495

    
496
    buf->base_buffer = base_buffer;
497
    buf->line_count = line_count;
498
    buf->line_width = line_width;
499
    buf->data_count = max_allocated_lines;
500
    buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
501
    buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
502

    
503
    for (i = 0; i < max_allocated_lines; i++)
504
    {
505
      buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
506
    }
507

    
508
    buf->data_stack_top = max_allocated_lines - 1;
509
}
510

    
511
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
512
{
513
    int offset;
514
    DWTELEM * buffer;
515

    
516
//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
517

    
518
    assert(buf->data_stack_top >= 0);
519
//  assert(!buf->line[line]);
520
    if (buf->line[line])
521
        return buf->line[line];
522

    
523
    offset = buf->line_width * line;
524
    buffer = buf->data_stack[buf->data_stack_top];
525
    buf->data_stack_top--;
526
    buf->line[line] = buffer;
527

    
528
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
529

    
530
    return buffer;
531
}
532

    
533
static void slice_buffer_release(slice_buffer * buf, int line)
534
{
535
    int offset;
536
    DWTELEM * buffer;
537

    
538
    assert(line >= 0 && line < buf->line_count);
539
    assert(buf->line[line]);
540

    
541
    offset = buf->line_width * line;
542
    buffer = buf->line[line];
543
    buf->data_stack_top++;
544
    buf->data_stack[buf->data_stack_top] = buffer;
545
    buf->line[line] = NULL;
546

    
547
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
548
}
549

    
550
static void slice_buffer_flush(slice_buffer * buf)
551
{
552
    int i;
553
    for (i = 0; i < buf->line_count; i++)
554
    {
555
        if (buf->line[i])
556
        {
557
//      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
558
            slice_buffer_release(buf, i);
559
        }
560
    }
561
}
562

    
563
static void slice_buffer_destroy(slice_buffer * buf)
564
{
565
    int i;
566
    slice_buffer_flush(buf);
567

    
568
    for (i = buf->data_count - 1; i >= 0; i--)
569
    {
570
        assert(buf->data_stack[i]);
571
        av_freep(&buf->data_stack[i]);
572
    }
573
    assert(buf->data_stack);
574
    av_freep(&buf->data_stack);
575
    assert(buf->line);
576
    av_freep(&buf->line);
577
}
578

    
579
#ifdef __sgi
580
// Avoid a name clash on SGI IRIX
581
#undef qexp
582
#endif
583
#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
584
static uint8_t qexp[QROOT];
585

    
586
static inline int mirror(int v, int m){
587
    while((unsigned)v > (unsigned)m){
588
        v=-v;
589
        if(v<0) v+= 2*m;
590
    }
591
    return v;
592
}
593

    
594
static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
595
    int i;
596

    
597
    if(v){
598
        const int a= FFABS(v);
599
        const int e= av_log2(a);
600
#if 1
601
        const int el= FFMIN(e, 10);
602
        put_rac(c, state+0, 0);
603

    
604
        for(i=0; i<el; i++){
605
            put_rac(c, state+1+i, 1);  //1..10
606
        }
607
        for(; i<e; i++){
608
            put_rac(c, state+1+9, 1);  //1..10
609
        }
610
        put_rac(c, state+1+FFMIN(i,9), 0);
611

    
612
        for(i=e-1; i>=el; i--){
613
            put_rac(c, state+22+9, (a>>i)&1); //22..31
614
        }
615
        for(; i>=0; i--){
616
            put_rac(c, state+22+i, (a>>i)&1); //22..31
617
        }
618

    
619
        if(is_signed)
620
            put_rac(c, state+11 + el, v < 0); //11..21
621
#else
622

    
623
        put_rac(c, state+0, 0);
624
        if(e<=9){
625
            for(i=0; i<e; i++){
626
                put_rac(c, state+1+i, 1);  //1..10
627
            }
628
            put_rac(c, state+1+i, 0);
629

    
630
            for(i=e-1; i>=0; i--){
631
                put_rac(c, state+22+i, (a>>i)&1); //22..31
632
            }
633

    
634
            if(is_signed)
635
                put_rac(c, state+11 + e, v < 0); //11..21
636
        }else{
637
            for(i=0; i<e; i++){
638
                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
639
            }
640
            put_rac(c, state+1+FFMIN(i,9), 0);
641

    
642
            for(i=e-1; i>=0; i--){
643
                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
644
            }
645

    
646
            if(is_signed)
647
                put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
648
        }
649
#endif
650
    }else{
651
        put_rac(c, state+0, 1);
652
    }
653
}
654

    
655
static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
656
    if(get_rac(c, state+0))
657
        return 0;
658
    else{
659
        int i, e, a;
660
        e= 0;
661
        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
662
            e++;
663
        }
664

    
665
        a= 1;
666
        for(i=e-1; i>=0; i--){
667
            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
668
        }
669

    
670
        if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
671
            return -a;
672
        else
673
            return a;
674
    }
675
}
676

    
677
static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
678
    int i;
679
    int r= log2>=0 ? 1<<log2 : 1;
680

    
681
    assert(v>=0);
682
    assert(log2>=-4);
683

    
684
    while(v >= r){
685
        put_rac(c, state+4+log2, 1);
686
        v -= r;
687
        log2++;
688
        if(log2>0) r+=r;
689
    }
690
    put_rac(c, state+4+log2, 0);
691

    
692
    for(i=log2-1; i>=0; i--){
693
        put_rac(c, state+31-i, (v>>i)&1);
694
    }
695
}
696

    
697
static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
698
    int i;
699
    int r= log2>=0 ? 1<<log2 : 1;
700
    int v=0;
701

    
702
    assert(log2>=-4);
703

    
704
    while(get_rac(c, state+4+log2)){
705
        v+= r;
706
        log2++;
707
        if(log2>0) r+=r;
708
    }
709

    
710
    for(i=log2-1; i>=0; i--){
711
        v+= get_rac(c, state+31-i)<<i;
712
    }
713

    
714
    return v;
715
}
716

    
717
static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
718
    const int mirror_left= !highpass;
719
    const int mirror_right= (width&1) ^ highpass;
720
    const int w= (width>>1) - 1 + (highpass & width);
721
    int i;
722

    
723
#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
724
    if(mirror_left){
725
        dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
726
        dst += dst_step;
727
        src += src_step;
728
    }
729

    
730
    for(i=0; i<w; i++){
731
        dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
732
    }
733

    
734
    if(mirror_right){
735
        dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
736
    }
737
}
738

    
739
#ifndef lift5
740
static av_always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
741
    const int mirror_left= !highpass;
742
    const int mirror_right= (width&1) ^ highpass;
743
    const int w= (width>>1) - 1 + (highpass & width);
744
    int i;
745

    
746
    if(mirror_left){
747
        int r= 3*2*ref[0];
748
        r += r>>4;
749
        r += r>>8;
750
        dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
751
        dst += dst_step;
752
        src += src_step;
753
    }
754

    
755
    for(i=0; i<w; i++){
756
        int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
757
        r += r>>4;
758
        r += r>>8;
759
        dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
760
    }
761

    
762
    if(mirror_right){
763
        int r= 3*2*ref[w*ref_step];
764
        r += r>>4;
765
        r += r>>8;
766
        dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
767
    }
768
}
769
#endif
770

    
771
#ifndef liftS
772
static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
773
    const int mirror_left= !highpass;
774
    const int mirror_right= (width&1) ^ highpass;
775
    const int w= (width>>1) - 1 + (highpass & width);
776
    int i;
777

    
778
    assert(shift == 4);
779
#define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
780
    if(mirror_left){
781
        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
782
        dst += dst_step;
783
        src += src_step;
784
    }
785

    
786
    for(i=0; i<w; i++){
787
        dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
788
    }
789

    
790
    if(mirror_right){
791
        dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
792
    }
793
}
794
#endif
795

    
796

    
797
static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
798
    int x, i;
799

    
800
    for(x=start; x<width; x+=2){
801
        int64_t sum=0;
802

    
803
        for(i=0; i<n; i++){
804
            int x2= x + 2*i - n + 1;
805
            if     (x2<     0) x2= -x2;
806
            else if(x2>=width) x2= 2*width-x2-2;
807
            sum += coeffs[i]*(int64_t)dst[x2];
808
        }
809
        if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
810
        else        dst[x] += (sum + (1<<shift)/2)>>shift;
811
    }
812
}
813

    
814
static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
815
    int x, y, i;
816
    for(y=start; y<height; y+=2){
817
        for(x=0; x<width; x++){
818
            int64_t sum=0;
819

    
820
            for(i=0; i<n; i++){
821
                int y2= y + 2*i - n + 1;
822
                if     (y2<      0) y2= -y2;
823
                else if(y2>=height) y2= 2*height-y2-2;
824
                sum += coeffs[i]*(int64_t)dst[x + y2*stride];
825
            }
826
            if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
827
            else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
828
        }
829
    }
830
}
831

    
832
#define SCALEX 1
833
#define LX0 0
834
#define LX1 1
835

    
836
#if 0 // more accurate 9/7
837
#define N1 2
838
#define SHIFT1 14
839
#define COEFFS1 (int[]){-25987,-25987}
840
#define N2 2
841
#define SHIFT2 19
842
#define COEFFS2 (int[]){-27777,-27777}
843
#define N3 2
844
#define SHIFT3 15
845
#define COEFFS3 (int[]){28931,28931}
846
#define N4 2
847
#define SHIFT4 15
848
#define COEFFS4 (int[]){14533,14533}
849
#elif 1 // 13/7 CRF
850
#define N1 4
851
#define SHIFT1 4
852
#define COEFFS1 (int[]){1,-9,-9,1}
853
#define N2 4
854
#define SHIFT2 4
855
#define COEFFS2 (int[]){-1,5,5,-1}
856
#define N3 0
857
#define SHIFT3 1
858
#define COEFFS3 NULL
859
#define N4 0
860
#define SHIFT4 1
861
#define COEFFS4 NULL
862
#elif 1 // 3/5
863
#define LX0 1
864
#define LX1 0
865
#define SCALEX 0.5
866
#define N1 2
867
#define SHIFT1 1
868
#define COEFFS1 (int[]){1,1}
869
#define N2 2
870
#define SHIFT2 2
871
#define COEFFS2 (int[]){-1,-1}
872
#define N3 0
873
#define SHIFT3 0
874
#define COEFFS3 NULL
875
#define N4 0
876
#define SHIFT4 0
877
#define COEFFS4 NULL
878
#elif 1 // 11/5
879
#define N1 0
880
#define SHIFT1 1
881
#define COEFFS1 NULL
882
#define N2 2
883
#define SHIFT2 2
884
#define COEFFS2 (int[]){-1,-1}
885
#define N3 2
886
#define SHIFT3 0
887
#define COEFFS3 (int[]){-1,-1}
888
#define N4 4
889
#define SHIFT4 7
890
#define COEFFS4 (int[]){-5,29,29,-5}
891
#define SCALEX 4
892
#elif 1 // 9/7 CDF
893
#define N1 2
894
#define SHIFT1 7
895
#define COEFFS1 (int[]){-203,-203}
896
#define N2 2
897
#define SHIFT2 12
898
#define COEFFS2 (int[]){-217,-217}
899
#define N3 2
900
#define SHIFT3 7
901
#define COEFFS3 (int[]){113,113}
902
#define N4 2
903
#define SHIFT4 9
904
#define COEFFS4 (int[]){227,227}
905
#define SCALEX 1
906
#elif 1 // 7/5 CDF
907
#define N1 0
908
#define SHIFT1 1
909
#define COEFFS1 NULL
910
#define N2 2
911
#define SHIFT2 2
912
#define COEFFS2 (int[]){-1,-1}
913
#define N3 2
914
#define SHIFT3 0
915
#define COEFFS3 (int[]){-1,-1}
916
#define N4 2
917
#define SHIFT4 4
918
#define COEFFS4 (int[]){3,3}
919
#elif 1 // 9/7 MN
920
#define N1 4
921
#define SHIFT1 4
922
#define COEFFS1 (int[]){1,-9,-9,1}
923
#define N2 2
924
#define SHIFT2 2
925
#define COEFFS2 (int[]){1,1}
926
#define N3 0
927
#define SHIFT3 1
928
#define COEFFS3 NULL
929
#define N4 0
930
#define SHIFT4 1
931
#define COEFFS4 NULL
932
#else // 13/7 CRF
933
#define N1 4
934
#define SHIFT1 4
935
#define COEFFS1 (int[]){1,-9,-9,1}
936
#define N2 4
937
#define SHIFT2 4
938
#define COEFFS2 (int[]){-1,5,5,-1}
939
#define N3 0
940
#define SHIFT3 1
941
#define COEFFS3 NULL
942
#define N4 0
943
#define SHIFT4 1
944
#define COEFFS4 NULL
945
#endif
946
static void horizontal_decomposeX(DWTELEM *b, int width){
947
    DWTELEM temp[width];
948
    const int width2= width>>1;
949
    const int w2= (width+1)>>1;
950
    int x;
951

    
952
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
953
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
954
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
955
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
956

    
957
    for(x=0; x<width2; x++){
958
        temp[x   ]= b[2*x    ];
959
        temp[x+w2]= b[2*x + 1];
960
    }
961
    if(width&1)
962
        temp[x   ]= b[2*x    ];
963
    memcpy(b, temp, width*sizeof(int));
964
}
965

    
966
static void horizontal_composeX(DWTELEM *b, int width){
967
    DWTELEM temp[width];
968
    const int width2= width>>1;
969
    int x;
970
    const int w2= (width+1)>>1;
971

    
972
    memcpy(temp, b, width*sizeof(int));
973
    for(x=0; x<width2; x++){
974
        b[2*x    ]= temp[x   ];
975
        b[2*x + 1]= temp[x+w2];
976
    }
977
    if(width&1)
978
        b[2*x    ]= temp[x   ];
979

    
980
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
981
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
982
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
983
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
984
}
985

    
986
static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
987
    int x, y;
988

    
989
    for(y=0; y<height; y++){
990
        for(x=0; x<width; x++){
991
            buffer[y*stride + x] *= SCALEX;
992
        }
993
    }
994

    
995
    for(y=0; y<height; y++){
996
        horizontal_decomposeX(buffer + y*stride, width);
997
    }
998

    
999
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
1000
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
1001
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
1002
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
1003
}
1004

    
1005
static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
1006
    int x, y;
1007

    
1008
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1009
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1010
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1011
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1012

    
1013
    for(y=0; y<height; y++){
1014
        horizontal_composeX(buffer + y*stride, width);
1015
    }
1016

    
1017
    for(y=0; y<height; y++){
1018
        for(x=0; x<width; x++){
1019
            buffer[y*stride + x] /= SCALEX;
1020
        }
1021
    }
1022
}
1023

    
1024
static void horizontal_decompose53i(DWTELEM *b, int width){
1025
    DWTELEM temp[width];
1026
    const int width2= width>>1;
1027
    int x;
1028
    const int w2= (width+1)>>1;
1029

    
1030
    for(x=0; x<width2; x++){
1031
        temp[x   ]= b[2*x    ];
1032
        temp[x+w2]= b[2*x + 1];
1033
    }
1034
    if(width&1)
1035
        temp[x   ]= b[2*x    ];
1036
#if 0
1037
    {
1038
    int A1,A2,A3,A4;
1039
    A2= temp[1       ];
1040
    A4= temp[0       ];
1041
    A1= temp[0+width2];
1042
    A1 -= (A2 + A4)>>1;
1043
    A4 += (A1 + 1)>>1;
1044
    b[0+width2] = A1;
1045
    b[0       ] = A4;
1046
    for(x=1; x+1<width2; x+=2){
1047
        A3= temp[x+width2];
1048
        A4= temp[x+1     ];
1049
        A3 -= (A2 + A4)>>1;
1050
        A2 += (A1 + A3 + 2)>>2;
1051
        b[x+width2] = A3;
1052
        b[x       ] = A2;
1053

1054
        A1= temp[x+1+width2];
1055
        A2= temp[x+2       ];
1056
        A1 -= (A2 + A4)>>1;
1057
        A4 += (A1 + A3 + 2)>>2;
1058
        b[x+1+width2] = A1;
1059
        b[x+1       ] = A4;
1060
    }
1061
    A3= temp[width-1];
1062
    A3 -= A2;
1063
    A2 += (A1 + A3 + 2)>>2;
1064
    b[width -1] = A3;
1065
    b[width2-1] = A2;
1066
    }
1067
#else
1068
    lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1069
    lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1070
#endif
1071
}
1072

    
1073
static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1074
    int i;
1075

    
1076
    for(i=0; i<width; i++){
1077
        b1[i] -= (b0[i] + b2[i])>>1;
1078
    }
1079
}
1080

    
1081
static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1082
    int i;
1083

    
1084
    for(i=0; i<width; i++){
1085
        b1[i] += (b0[i] + b2[i] + 2)>>2;
1086
    }
1087
}
1088

    
1089
static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1090
    int y;
1091
    DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1092
    DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1093

    
1094
    for(y=-2; y<height; y+=2){
1095
        DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1096
        DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1097

    
1098
{START_TIMER
1099
        if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1100
        if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1101
STOP_TIMER("horizontal_decompose53i")}
1102

    
1103
{START_TIMER
1104
        if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1105
        if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1106
STOP_TIMER("vertical_decompose53i*")}
1107

    
1108
        b0=b2;
1109
        b1=b3;
1110
    }
1111
}
1112

    
1113
static void horizontal_decompose97i(DWTELEM *b, int width){
1114
    DWTELEM temp[width];
1115
    const int w2= (width+1)>>1;
1116

    
1117
    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1118
    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1119
    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1120
    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1121
}
1122

    
1123

    
1124
static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1125
    int i;
1126

    
1127
    for(i=0; i<width; i++){
1128
        b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1129
    }
1130
}
1131

    
1132
static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1133
    int i;
1134

    
1135
    for(i=0; i<width; i++){
1136
#ifdef lift5
1137
        b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1138
#else
1139
        int r= 3*(b0[i] + b2[i]);
1140
        r+= r>>4;
1141
        r+= r>>8;
1142
        b1[i] += (r+W_CO)>>W_CS;
1143
#endif
1144
    }
1145
}
1146

    
1147
static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1148
    int i;
1149

    
1150
    for(i=0; i<width; i++){
1151
#ifdef liftS
1152
        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1153
#else
1154
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1155
#endif
1156
    }
1157
}
1158

    
1159
static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1160
    int i;
1161

    
1162
    for(i=0; i<width; i++){
1163
        b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1164
    }
1165
}
1166

    
1167
static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1168
    int y;
1169
    DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1170
    DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1171
    DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1172
    DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1173

    
1174
    for(y=-4; y<height; y+=2){
1175
        DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1176
        DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1177

    
1178
{START_TIMER
1179
        if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1180
        if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1181
if(width>400){
1182
STOP_TIMER("horizontal_decompose97i")
1183
}}
1184

    
1185
{START_TIMER
1186
        if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1187
        if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1188
        if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1189
        if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1190

    
1191
if(width>400){
1192
STOP_TIMER("vertical_decompose97i")
1193
}}
1194

    
1195
        b0=b2;
1196
        b1=b3;
1197
        b2=b4;
1198
        b3=b5;
1199
    }
1200
}
1201

    
1202
void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1203
    int level;
1204

    
1205
    for(level=0; level<decomposition_count; level++){
1206
        switch(type){
1207
        case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1208
        case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1209
        case DWT_X: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1210
        }
1211
    }
1212
}
1213

    
1214
static void horizontal_compose53i(DWTELEM *b, int width){
1215
    DWTELEM temp[width];
1216
    const int width2= width>>1;
1217
    const int w2= (width+1)>>1;
1218
    int x;
1219

    
1220
#if 0
1221
    int A1,A2,A3,A4;
1222
    A2= temp[1       ];
1223
    A4= temp[0       ];
1224
    A1= temp[0+width2];
1225
    A1 -= (A2 + A4)>>1;
1226
    A4 += (A1 + 1)>>1;
1227
    b[0+width2] = A1;
1228
    b[0       ] = A4;
1229
    for(x=1; x+1<width2; x+=2){
1230
        A3= temp[x+width2];
1231
        A4= temp[x+1     ];
1232
        A3 -= (A2 + A4)>>1;
1233
        A2 += (A1 + A3 + 2)>>2;
1234
        b[x+width2] = A3;
1235
        b[x       ] = A2;
1236

1237
        A1= temp[x+1+width2];
1238
        A2= temp[x+2       ];
1239
        A1 -= (A2 + A4)>>1;
1240
        A4 += (A1 + A3 + 2)>>2;
1241
        b[x+1+width2] = A1;
1242
        b[x+1       ] = A4;
1243
    }
1244
    A3= temp[width-1];
1245
    A3 -= A2;
1246
    A2 += (A1 + A3 + 2)>>2;
1247
    b[width -1] = A3;
1248
    b[width2-1] = A2;
1249
#else
1250
    lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1251
    lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1252
#endif
1253
    for(x=0; x<width2; x++){
1254
        b[2*x    ]= temp[x   ];
1255
        b[2*x + 1]= temp[x+w2];
1256
    }
1257
    if(width&1)
1258
        b[2*x    ]= temp[x   ];
1259
}
1260

    
1261
static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1262
    int i;
1263

    
1264
    for(i=0; i<width; i++){
1265
        b1[i] += (b0[i] + b2[i])>>1;
1266
    }
1267
}
1268

    
1269
static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1270
    int i;
1271

    
1272
    for(i=0; i<width; i++){
1273
        b1[i] -= (b0[i] + b2[i] + 2)>>2;
1274
    }
1275
}
1276

    
1277
static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1278
    cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1279
    cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1280
    cs->y = -1;
1281
}
1282

    
1283
static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1284
    cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1285
    cs->b1 = buffer + mirror(-1  , height-1)*stride;
1286
    cs->y = -1;
1287
}
1288

    
1289
static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1290
    int y= cs->y;
1291

    
1292
    DWTELEM *b0= cs->b0;
1293
    DWTELEM *b1= cs->b1;
1294
    DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1295
    DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1296

    
1297
{START_TIMER
1298
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1299
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1300
STOP_TIMER("vertical_compose53i*")}
1301

    
1302
{START_TIMER
1303
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1304
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1305
STOP_TIMER("horizontal_compose53i")}
1306

    
1307
    cs->b0 = b2;
1308
    cs->b1 = b3;
1309
    cs->y += 2;
1310
}
1311

    
1312
static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1313
    int y= cs->y;
1314
    DWTELEM *b0= cs->b0;
1315
    DWTELEM *b1= cs->b1;
1316
    DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1317
    DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1318

    
1319
{START_TIMER
1320
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1321
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1322
STOP_TIMER("vertical_compose53i*")}
1323

    
1324
{START_TIMER
1325
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1326
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1327
STOP_TIMER("horizontal_compose53i")}
1328

    
1329
    cs->b0 = b2;
1330
    cs->b1 = b3;
1331
    cs->y += 2;
1332
}
1333

    
1334
static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1335
    dwt_compose_t cs;
1336
    spatial_compose53i_init(&cs, buffer, height, stride);
1337
    while(cs.y <= height)
1338
        spatial_compose53i_dy(&cs, buffer, width, height, stride);
1339
}
1340

    
1341

    
1342
void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1343
    DWTELEM temp[width];
1344
    const int w2= (width+1)>>1;
1345

    
1346
    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1347
    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1348
    liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1349
    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1350
}
1351

    
1352
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1353
    int i;
1354

    
1355
    for(i=0; i<width; i++){
1356
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1357
    }
1358
}
1359

    
1360
static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1361
    int i;
1362

    
1363
    for(i=0; i<width; i++){
1364
#ifdef lift5
1365
        b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1366
#else
1367
        int r= 3*(b0[i] + b2[i]);
1368
        r+= r>>4;
1369
        r+= r>>8;
1370
        b1[i] -= (r+W_CO)>>W_CS;
1371
#endif
1372
    }
1373
}
1374

    
1375
static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1376
    int i;
1377

    
1378
    for(i=0; i<width; i++){
1379
#ifdef liftS
1380
        b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1381
#else
1382
        b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1383
#endif
1384
    }
1385
}
1386

    
1387
static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1388
    int i;
1389

    
1390
    for(i=0; i<width; i++){
1391
        b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1392
    }
1393
}
1394

    
1395
void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1396
    int i;
1397

    
1398
    for(i=0; i<width; i++){
1399
#ifndef lift5
1400
        int r;
1401
#endif
1402
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1403
#ifdef lift5
1404
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1405
#else
1406
        r= 3*(b2[i] + b4[i]);
1407
        r+= r>>4;
1408
        r+= r>>8;
1409
        b3[i] -= (r+W_CO)>>W_CS;
1410
#endif
1411
#ifdef liftS
1412
        b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1413
#else
1414
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1415
#endif
1416
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1417
    }
1418
}
1419

    
1420
static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1421
    cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1422
    cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1423
    cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1424
    cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1425
    cs->y = -3;
1426
}
1427

    
1428
static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1429
    cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1430
    cs->b1 = buffer + mirror(-3  , height-1)*stride;
1431
    cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1432
    cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1433
    cs->y = -3;
1434
}
1435

    
1436
static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1437
    int y = cs->y;
1438

    
1439
    DWTELEM *b0= cs->b0;
1440
    DWTELEM *b1= cs->b1;
1441
    DWTELEM *b2= cs->b2;
1442
    DWTELEM *b3= cs->b3;
1443
    DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1444
    DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1445

    
1446
{START_TIMER
1447
    if(y>0 && y+4<height){
1448
        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1449
    }else{
1450
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1451
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1452
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1453
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1454
    }
1455
if(width>400){
1456
STOP_TIMER("vertical_compose97i")}}
1457

    
1458
{START_TIMER
1459
        if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1460
        if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1461
if(width>400 && y+0<(unsigned)height){
1462
STOP_TIMER("horizontal_compose97i")}}
1463

    
1464
    cs->b0=b2;
1465
    cs->b1=b3;
1466
    cs->b2=b4;
1467
    cs->b3=b5;
1468
    cs->y += 2;
1469
}
1470

    
1471
static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1472
    int y = cs->y;
1473
    DWTELEM *b0= cs->b0;
1474
    DWTELEM *b1= cs->b1;
1475
    DWTELEM *b2= cs->b2;
1476
    DWTELEM *b3= cs->b3;
1477
    DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1478
    DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1479

    
1480
{START_TIMER
1481
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1482
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1483
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1484
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1485
if(width>400){
1486
STOP_TIMER("vertical_compose97i")}}
1487

    
1488
{START_TIMER
1489
        if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1490
        if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1491
if(width>400 && b0 <= b2){
1492
STOP_TIMER("horizontal_compose97i")}}
1493

    
1494
    cs->b0=b2;
1495
    cs->b1=b3;
1496
    cs->b2=b4;
1497
    cs->b3=b5;
1498
    cs->y += 2;
1499
}
1500

    
1501
static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1502
    dwt_compose_t cs;
1503
    spatial_compose97i_init(&cs, buffer, height, stride);
1504
    while(cs.y <= height)
1505
        spatial_compose97i_dy(&cs, buffer, width, height, stride);
1506
}
1507

    
1508
static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1509
    int level;
1510
    for(level=decomposition_count-1; level>=0; level--){
1511
        switch(type){
1512
        case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1513
        case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1514
        /* not slicified yet */
1515
        case DWT_X: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1516
          av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1517
        }
1518
    }
1519
}
1520

    
1521
static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1522
    int level;
1523
    for(level=decomposition_count-1; level>=0; level--){
1524
        switch(type){
1525
        case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1526
        case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1527
        /* not slicified yet */
1528
        case DWT_X: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1529
        }
1530
    }
1531
}
1532

    
1533
static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1534
    const int support = type==1 ? 3 : 5;
1535
    int level;
1536
    if(type==2) return;
1537

    
1538
    for(level=decomposition_count-1; level>=0; level--){
1539
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1540
            switch(type){
1541
            case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1542
                    break;
1543
            case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1544
                    break;
1545
            case DWT_X: break;
1546
            }
1547
        }
1548
    }
1549
}
1550

    
1551
static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1552
    const int support = type==1 ? 3 : 5;
1553
    int level;
1554
    if(type==2) return;
1555

    
1556
    for(level=decomposition_count-1; level>=0; level--){
1557
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1558
            switch(type){
1559
            case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1560
                    break;
1561
            case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1562
                    break;
1563
            case DWT_X: break;
1564
            }
1565
        }
1566
    }
1567
}
1568

    
1569
static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1570
    if(type==2){
1571
        int level;
1572
        for(level=decomposition_count-1; level>=0; level--)
1573
            spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1574
    }else{
1575
        dwt_compose_t cs[MAX_DECOMPOSITIONS];
1576
        int y;
1577
        ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1578
        for(y=0; y<height; y+=4)
1579
            ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1580
    }
1581
}
1582

    
1583
static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1584
    const int w= b->width;
1585
    const int h= b->height;
1586
    int x, y;
1587

    
1588
    if(1){
1589
        int run=0;
1590
        int runs[w*h];
1591
        int run_index=0;
1592
        int max_index;
1593

    
1594
        for(y=0; y<h; y++){
1595
            for(x=0; x<w; x++){
1596
                int v, p=0;
1597
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1598
                v= src[x + y*stride];
1599

    
1600
                if(y){
1601
                    t= src[x + (y-1)*stride];
1602
                    if(x){
1603
                        lt= src[x - 1 + (y-1)*stride];
1604
                    }
1605
                    if(x + 1 < w){
1606
                        rt= src[x + 1 + (y-1)*stride];
1607
                    }
1608
                }
1609
                if(x){
1610
                    l= src[x - 1 + y*stride];
1611
                    /*if(x > 1){
1612
                        if(orientation==1) ll= src[y + (x-2)*stride];
1613
                        else               ll= src[x - 2 + y*stride];
1614
                    }*/
1615
                }
1616
                if(parent){
1617
                    int px= x>>1;
1618
                    int py= y>>1;
1619
                    if(px<b->parent->width && py<b->parent->height)
1620
                        p= parent[px + py*2*stride];
1621
                }
1622
                if(!(/*ll|*/l|lt|t|rt|p)){
1623
                    if(v){
1624
                        runs[run_index++]= run;
1625
                        run=0;
1626
                    }else{
1627
                        run++;
1628
                    }
1629
                }
1630
            }
1631
        }
1632
        max_index= run_index;
1633
        runs[run_index++]= run;
1634
        run_index=0;
1635
        run= runs[run_index++];
1636

    
1637
        put_symbol2(&s->c, b->state[30], max_index, 0);
1638
        if(run_index <= max_index)
1639
            put_symbol2(&s->c, b->state[1], run, 3);
1640

    
1641
        for(y=0; y<h; y++){
1642
            if(s->c.bytestream_end - s->c.bytestream < w*40){
1643
                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1644
                return -1;
1645
            }
1646
            for(x=0; x<w; x++){
1647
                int v, p=0;
1648
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1649
                v= src[x + y*stride];
1650

    
1651
                if(y){
1652
                    t= src[x + (y-1)*stride];
1653
                    if(x){
1654
                        lt= src[x - 1 + (y-1)*stride];
1655
                    }
1656
                    if(x + 1 < w){
1657
                        rt= src[x + 1 + (y-1)*stride];
1658
                    }
1659
                }
1660
                if(x){
1661
                    l= src[x - 1 + y*stride];
1662
                    /*if(x > 1){
1663
                        if(orientation==1) ll= src[y + (x-2)*stride];
1664
                        else               ll= src[x - 2 + y*stride];
1665
                    }*/
1666
                }
1667
                if(parent){
1668
                    int px= x>>1;
1669
                    int py= y>>1;
1670
                    if(px<b->parent->width && py<b->parent->height)
1671
                        p= parent[px + py*2*stride];
1672
                }
1673
                if(/*ll|*/l|lt|t|rt|p){
1674
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1675

    
1676
                    put_rac(&s->c, &b->state[0][context], !!v);
1677
                }else{
1678
                    if(!run){
1679
                        run= runs[run_index++];
1680

    
1681
                        if(run_index <= max_index)
1682
                            put_symbol2(&s->c, b->state[1], run, 3);
1683
                        assert(v);
1684
                    }else{
1685
                        run--;
1686
                        assert(!v);
1687
                    }
1688
                }
1689
                if(v){
1690
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1691
                    int l2= 2*FFABS(l) + (l<0);
1692
                    int t2= 2*FFABS(t) + (t<0);
1693

    
1694
                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1695
                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1696
                }
1697
            }
1698
        }
1699
    }
1700
    return 0;
1701
}
1702

    
1703
static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1704
//    encode_subband_qtree(s, b, src, parent, stride, orientation);
1705
//    encode_subband_z0run(s, b, src, parent, stride, orientation);
1706
    return encode_subband_c0run(s, b, src, parent, stride, orientation);
1707
//    encode_subband_dzr(s, b, src, parent, stride, orientation);
1708
}
1709

    
1710
static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1711
    const int w= b->width;
1712
    const int h= b->height;
1713
    int x,y;
1714

    
1715
    if(1){
1716
        int run, runs;
1717
        x_and_coeff *xc= b->x_coeff;
1718
        x_and_coeff *prev_xc= NULL;
1719
        x_and_coeff *prev2_xc= xc;
1720
        x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1721
        x_and_coeff *prev_parent_xc= parent_xc;
1722

    
1723
        runs= get_symbol2(&s->c, b->state[30], 0);
1724
        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1725
        else           run= INT_MAX;
1726

    
1727
        for(y=0; y<h; y++){
1728
            int v=0;
1729
            int lt=0, t=0, rt=0;
1730

    
1731
            if(y && prev_xc->x == 0){
1732
                rt= prev_xc->coeff;
1733
            }
1734
            for(x=0; x<w; x++){
1735
                int p=0;
1736
                const int l= v;
1737

    
1738
                lt= t; t= rt;
1739

    
1740
                if(y){
1741
                    if(prev_xc->x <= x)
1742
                        prev_xc++;
1743
                    if(prev_xc->x == x + 1)
1744
                        rt= prev_xc->coeff;
1745
                    else
1746
                        rt=0;
1747
                }
1748
                if(parent_xc){
1749
                    if(x>>1 > parent_xc->x){
1750
                        parent_xc++;
1751
                    }
1752
                    if(x>>1 == parent_xc->x){
1753
                        p= parent_xc->coeff;
1754
                    }
1755
                }
1756
                if(/*ll|*/l|lt|t|rt|p){
1757
                    int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1758

    
1759
                    v=get_rac(&s->c, &b->state[0][context]);
1760
                    if(v){
1761
                        v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1762
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1763

    
1764
                        xc->x=x;
1765
                        (xc++)->coeff= v;
1766
                    }
1767
                }else{
1768
                    if(!run){
1769
                        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1770
                        else           run= INT_MAX;
1771
                        v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1772
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1773

    
1774
                        xc->x=x;
1775
                        (xc++)->coeff= v;
1776
                    }else{
1777
                        int max_run;
1778
                        run--;
1779
                        v=0;
1780

    
1781
                        if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1782
                        else  max_run= FFMIN(run, w-x-1);
1783
                        if(parent_xc)
1784
                            max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1785
                        x+= max_run;
1786
                        run-= max_run;
1787
                    }
1788
                }
1789
            }
1790
            (xc++)->x= w+1; //end marker
1791
            prev_xc= prev2_xc;
1792
            prev2_xc= xc;
1793

    
1794
            if(parent_xc){
1795
                if(y&1){
1796
                    while(parent_xc->x != parent->width+1)
1797
                        parent_xc++;
1798
                    parent_xc++;
1799
                    prev_parent_xc= parent_xc;
1800
                }else{
1801
                    parent_xc= prev_parent_xc;
1802
                }
1803
            }
1804
        }
1805

    
1806
        (xc++)->x= w+1; //end marker
1807
    }
1808
}
1809

    
1810
static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1811
    const int w= b->width;
1812
    int y;
1813
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
1814
    int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1815
    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1816
    int new_index = 0;
1817

    
1818
    START_TIMER
1819

    
1820
    if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1821
        qadd= 0;
1822
        qmul= 1<<QEXPSHIFT;
1823
    }
1824

    
1825
    /* If we are on the second or later slice, restore our index. */
1826
    if (start_y != 0)
1827
        new_index = save_state[0];
1828

    
1829

    
1830
    for(y=start_y; y<h; y++){
1831
        int x = 0;
1832
        int v;
1833
        DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1834
        memset(line, 0, b->width*sizeof(DWTELEM));
1835
        v = b->x_coeff[new_index].coeff;
1836
        x = b->x_coeff[new_index++].x;
1837
        while(x < w)
1838
        {
1839
            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1840
            register int u= -(v&1);
1841
            line[x] = (t^u) - u;
1842

    
1843
            v = b->x_coeff[new_index].coeff;
1844
            x = b->x_coeff[new_index++].x;
1845
        }
1846
    }
1847
    if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1848
        STOP_TIMER("decode_subband")
1849
    }
1850

    
1851
    /* Save our variables for the next slice. */
1852
    save_state[0] = new_index;
1853

    
1854
    return;
1855
}
1856

    
1857
static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1858
    int plane_index, level, orientation;
1859

    
1860
    for(plane_index=0; plane_index<3; plane_index++){
1861
        for(level=0; level<s->spatial_decomposition_count; level++){
1862
            for(orientation=level ? 1:0; orientation<4; orientation++){
1863
                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1864
            }
1865
        }
1866
    }
1867
    memset(s->header_state, MID_STATE, sizeof(s->header_state));
1868
    memset(s->block_state, MID_STATE, sizeof(s->block_state));
1869
}
1870

    
1871
static int alloc_blocks(SnowContext *s){
1872
    int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1873
    int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1874

    
1875
    s->b_width = w;
1876
    s->b_height= h;
1877

    
1878
    s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1879
    return 0;
1880
}
1881

    
1882
static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1883
    uint8_t *bytestream= d->bytestream;
1884
    uint8_t *bytestream_start= d->bytestream_start;
1885
    *d= *s;
1886
    d->bytestream= bytestream;
1887
    d->bytestream_start= bytestream_start;
1888
}
1889

    
1890
//near copy & paste from dsputil, FIXME
1891
static int pix_sum(uint8_t * pix, int line_size, int w)
1892
{
1893
    int s, i, j;
1894

    
1895
    s = 0;
1896
    for (i = 0; i < w; i++) {
1897
        for (j = 0; j < w; j++) {
1898
            s += pix[0];
1899
            pix ++;
1900
        }
1901
        pix += line_size - w;
1902
    }
1903
    return s;
1904
}
1905

    
1906
//near copy & paste from dsputil, FIXME
1907
static int pix_norm1(uint8_t * pix, int line_size, int w)
1908
{
1909
    int s, i, j;
1910
    uint32_t *sq = ff_squareTbl + 256;
1911

    
1912
    s = 0;
1913
    for (i = 0; i < w; i++) {
1914
        for (j = 0; j < w; j ++) {
1915
            s += sq[pix[0]];
1916
            pix ++;
1917
        }
1918
        pix += line_size - w;
1919
    }
1920
    return s;
1921
}
1922

    
1923
static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1924
    const int w= s->b_width << s->block_max_depth;
1925
    const int rem_depth= s->block_max_depth - level;
1926
    const int index= (x + y*w) << rem_depth;
1927
    const int block_w= 1<<rem_depth;
1928
    BlockNode block;
1929
    int i,j;
1930

    
1931
    block.color[0]= l;
1932
    block.color[1]= cb;
1933
    block.color[2]= cr;
1934
    block.mx= mx;
1935
    block.my= my;
1936
    block.ref= ref;
1937
    block.type= type;
1938
    block.level= level;
1939

    
1940
    for(j=0; j<block_w; j++){
1941
        for(i=0; i<block_w; i++){
1942
            s->block[index + i + j*w]= block;
1943
        }
1944
    }
1945
}
1946

    
1947
static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1948
    const int offset[3]= {
1949
          y*c->  stride + x,
1950
        ((y*c->uvstride + x)>>1),
1951
        ((y*c->uvstride + x)>>1),
1952
    };
1953
    int i;
1954
    for(i=0; i<3; i++){
1955
        c->src[0][i]= src [i];
1956
        c->ref[0][i]= ref [i] + offset[i];
1957
    }
1958
    assert(!ref_index);
1959
}
1960

    
1961
static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1962
                           BlockNode *left, BlockNode *top, BlockNode *tr){
1963
    if(s->ref_frames == 1){
1964
        *mx = mid_pred(left->mx, top->mx, tr->mx);
1965
        *my = mid_pred(left->my, top->my, tr->my);
1966
    }else{
1967
        const int *scale = scale_mv_ref[ref];
1968
        *mx = mid_pred(left->mx * scale[left->ref] + 128 >>8,
1969
                       top ->mx * scale[top ->ref] + 128 >>8,
1970
                       tr  ->mx * scale[tr  ->ref] + 128 >>8);
1971
        *my = mid_pred(left->my * scale[left->ref] + 128 >>8,
1972
                       top ->my * scale[top ->ref] + 128 >>8,
1973
                       tr  ->my * scale[tr  ->ref] + 128 >>8);
1974
    }
1975
}
1976

    
1977
//FIXME copy&paste
1978
#define P_LEFT P[1]
1979
#define P_TOP P[2]
1980
#define P_TOPRIGHT P[3]
1981
#define P_MEDIAN P[4]
1982
#define P_MV1 P[9]
1983
#define FLAG_QPEL   1 //must be 1
1984

    
1985
static int encode_q_branch(SnowContext *s, int level, int x, int y){
1986
    uint8_t p_buffer[1024];
1987
    uint8_t i_buffer[1024];
1988
    uint8_t p_state[sizeof(s->block_state)];
1989
    uint8_t i_state[sizeof(s->block_state)];
1990
    RangeCoder pc, ic;
1991
    uint8_t *pbbak= s->c.bytestream;
1992
    uint8_t *pbbak_start= s->c.bytestream_start;
1993
    int score, score2, iscore, i_len, p_len, block_s, sum;
1994
    const int w= s->b_width  << s->block_max_depth;
1995
    const int h= s->b_height << s->block_max_depth;
1996
    const int rem_depth= s->block_max_depth - level;
1997
    const int index= (x + y*w) << rem_depth;
1998
    const int block_w= 1<<(LOG2_MB_SIZE - level);
1999
    int trx= (x+1)<<rem_depth;
2000
    int try= (y+1)<<rem_depth;
2001
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2002
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
2003
    BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
2004
    BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2005
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2006
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2007
    int pl = left->color[0];
2008
    int pcb= left->color[1];
2009
    int pcr= left->color[2];
2010
    int pmx, pmy;
2011
    int mx=0, my=0;
2012
    int l,cr,cb;
2013
    const int stride= s->current_picture.linesize[0];
2014
    const int uvstride= s->current_picture.linesize[1];
2015
    uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2016
                                s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2017
                                s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2018
    int P[10][2];
2019
    int16_t last_mv[3][2];
2020
    int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2021
    const int shift= 1+qpel;
2022
    MotionEstContext *c= &s->m.me;
2023
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2024
    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2025
    int my_context= av_log2(2*FFABS(left->my - top->my));
2026
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2027
    int ref, best_ref, ref_score, ref_mx, ref_my;
2028

    
2029
    assert(sizeof(s->block_state) >= 256);
2030
    if(s->keyframe){
2031
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2032
        return 0;
2033
    }
2034

    
2035
//    clip predictors / edge ?
2036

    
2037
    P_LEFT[0]= left->mx;
2038
    P_LEFT[1]= left->my;
2039
    P_TOP [0]= top->mx;
2040
    P_TOP [1]= top->my;
2041
    P_TOPRIGHT[0]= tr->mx;
2042
    P_TOPRIGHT[1]= tr->my;
2043

    
2044
    last_mv[0][0]= s->block[index].mx;
2045
    last_mv[0][1]= s->block[index].my;
2046
    last_mv[1][0]= right->mx;
2047
    last_mv[1][1]= right->my;
2048
    last_mv[2][0]= bottom->mx;
2049
    last_mv[2][1]= bottom->my;
2050

    
2051
    s->m.mb_stride=2;
2052
    s->m.mb_x=
2053
    s->m.mb_y= 0;
2054
    s->m.me.skip= 0;
2055

    
2056
    assert(s->m.me.  stride ==   stride);
2057
    assert(s->m.me.uvstride == uvstride);
2058

    
2059
    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2060
    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2061
    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2062
    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2063

    
2064
    c->xmin = - x*block_w - 16+2;
2065
    c->ymin = - y*block_w - 16+2;
2066
    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2067
    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2068

    
2069
    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2070
    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2071
    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2072
    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2073
    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2074
    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2075
    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2076

    
2077
    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2078
    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2079

    
2080
    if (!y) {
2081
        c->pred_x= P_LEFT[0];
2082
        c->pred_y= P_LEFT[1];
2083
    } else {
2084
        c->pred_x = P_MEDIAN[0];
2085
        c->pred_y = P_MEDIAN[1];
2086
    }
2087

    
2088
    score= INT_MAX;
2089
    best_ref= 0;
2090
    for(ref=0; ref<s->ref_frames; ref++){
2091
        init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2092

    
2093
        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2094
                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2095

    
2096
        assert(ref_mx >= c->xmin);
2097
        assert(ref_mx <= c->xmax);
2098
        assert(ref_my >= c->ymin);
2099
        assert(ref_my <= c->ymax);
2100

    
2101
        ref_score= s->m.me.sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2102
        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2103
        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2104
        if(s->ref_mvs[ref]){
2105
            s->ref_mvs[ref][index][0]= ref_mx;
2106
            s->ref_mvs[ref][index][1]= ref_my;
2107
            s->ref_scores[ref][index]= ref_score;
2108
        }
2109
        if(score > ref_score){
2110
            score= ref_score;
2111
            best_ref= ref;
2112
            mx= ref_mx;
2113
            my= ref_my;
2114
        }
2115
    }
2116
    //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
2117

    
2118
  //  subpel search
2119
    pc= s->c;
2120
    pc.bytestream_start=
2121
    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2122
    memcpy(p_state, s->block_state, sizeof(s->block_state));
2123

    
2124
    if(level!=s->block_max_depth)
2125
        put_rac(&pc, &p_state[4 + s_context], 1);
2126
    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2127
    if(s->ref_frames > 1)
2128
        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2129
    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
2130
    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2131
    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2132
    p_len= pc.bytestream - pc.bytestream_start;
2133
    score += (s->lambda2*(p_len*8
2134
              + (pc.outstanding_count - s->c.outstanding_count)*8
2135
              + (-av_log2(pc.range)    + av_log2(s->c.range))
2136
             ))>>FF_LAMBDA_SHIFT;
2137

    
2138
    block_s= block_w*block_w;
2139
    sum = pix_sum(current_data[0], stride, block_w);
2140
    l= (sum + block_s/2)/block_s;
2141
    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2142

    
2143
    block_s= block_w*block_w>>2;
2144
    sum = pix_sum(current_data[1], uvstride, block_w>>1);
2145
    cb= (sum + block_s/2)/block_s;
2146
//    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2147
    sum = pix_sum(current_data[2], uvstride, block_w>>1);
2148
    cr= (sum + block_s/2)/block_s;
2149
//    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2150

    
2151
    ic= s->c;
2152
    ic.bytestream_start=
2153
    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2154
    memcpy(i_state, s->block_state, sizeof(s->block_state));
2155
    if(level!=s->block_max_depth)
2156
        put_rac(&ic, &i_state[4 + s_context], 1);
2157
    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2158
    put_symbol(&ic, &i_state[32],  l-pl , 1);
2159
    put_symbol(&ic, &i_state[64], cb-pcb, 1);
2160
    put_symbol(&ic, &i_state[96], cr-pcr, 1);
2161
    i_len= ic.bytestream - ic.bytestream_start;
2162
    iscore += (s->lambda2*(i_len*8
2163
              + (ic.outstanding_count - s->c.outstanding_count)*8
2164
              + (-av_log2(ic.range)    + av_log2(s->c.range))
2165
             ))>>FF_LAMBDA_SHIFT;
2166

    
2167
//    assert(score==256*256*256*64-1);
2168
    assert(iscore < 255*255*256 + s->lambda2*10);
2169
    assert(iscore >= 0);
2170
    assert(l>=0 && l<=255);
2171
    assert(pl>=0 && pl<=255);
2172

    
2173
    if(level==0){
2174
        int varc= iscore >> 8;
2175
        int vard= score >> 8;
2176
        if (vard <= 64 || vard < varc)
2177
            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2178
        else
2179
            c->scene_change_score+= s->m.qscale;
2180
    }
2181

    
2182
    if(level!=s->block_max_depth){
2183
        put_rac(&s->c, &s->block_state[4 + s_context], 0);
2184
        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2185
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2186
        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2187
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2188
        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2189

    
2190
        if(score2 < score && score2 < iscore)
2191
            return score2;
2192
    }
2193

    
2194
    if(iscore < score){
2195
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2196
        memcpy(pbbak, i_buffer, i_len);
2197
        s->c= ic;
2198
        s->c.bytestream_start= pbbak_start;
2199
        s->c.bytestream= pbbak + i_len;
2200
        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2201
        memcpy(s->block_state, i_state, sizeof(s->block_state));
2202
        return iscore;
2203
    }else{
2204
        memcpy(pbbak, p_buffer, p_len);
2205
        s->c= pc;
2206
        s->c.bytestream_start= pbbak_start;
2207
        s->c.bytestream= pbbak + p_len;
2208
        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2209
        memcpy(s->block_state, p_state, sizeof(s->block_state));
2210
        return score;
2211
    }
2212
}
2213

    
2214
static av_always_inline int same_block(BlockNode *a, BlockNode *b){
2215
    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2216
        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2217
    }else{
2218
        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2219
    }
2220
}
2221

    
2222
static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2223
    const int w= s->b_width  << s->block_max_depth;
2224
    const int rem_depth= s->block_max_depth - level;
2225
    const int index= (x + y*w) << rem_depth;
2226
    int trx= (x+1)<<rem_depth;
2227
    BlockNode *b= &s->block[index];
2228
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2229
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
2230
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2231
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2232
    int pl = left->color[0];
2233
    int pcb= left->color[1];
2234
    int pcr= left->color[2];
2235
    int pmx, pmy;
2236
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2237
    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2238
    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2239
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2240

    
2241
    if(s->keyframe){
2242
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2243
        return;
2244
    }
2245

    
2246
    if(level!=s->block_max_depth){
2247
        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2248
            put_rac(&s->c, &s->block_state[4 + s_context], 1);
2249
        }else{
2250
            put_rac(&s->c, &s->block_state[4 + s_context], 0);
2251
            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2252
            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2253
            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2254
            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2255
            return;
2256
        }
2257
    }
2258
    if(b->type & BLOCK_INTRA){
2259
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2260
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2261
        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2262
        put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2263
        put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2264
        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2265
    }else{
2266
        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2267
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2268
        if(s->ref_frames > 1)
2269
            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2270
        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2271
        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2272
        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2273
    }
2274
}
2275

    
2276
static void decode_q_branch(SnowContext *s, int level, int x, int y){
2277
    const int w= s->b_width << s->block_max_depth;
2278
    const int rem_depth= s->block_max_depth - level;
2279
    const int index= (x + y*w) << rem_depth;
2280
    int trx= (x+1)<<rem_depth;
2281
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2282
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
2283
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2284
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2285
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2286

    
2287
    if(s->keyframe){
2288
        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2289
        return;
2290
    }
2291

    
2292
    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2293
        int type;
2294
        int l = left->color[0];
2295
        int cb= left->color[1];
2296
        int cr= left->color[2];
2297
        int mx= mid_pred(left->mx, top->mx, tr->mx);
2298
        int my= mid_pred(left->my, top->my, tr->my);
2299
        int ref = 0;
2300
        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2301
        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2302
        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2303

    
2304
        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2305

    
2306
        if(type){
2307
            pred_mv(s, &mx, &my, 0, left, top, tr);
2308
            l += get_symbol(&s->c, &s->block_state[32], 1);
2309
            cb+= get_symbol(&s->c, &s->block_state[64], 1);
2310
            cr+= get_symbol(&s->c, &s->block_state[96], 1);
2311
        }else{
2312
            if(s->ref_frames > 1)
2313
                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2314
            pred_mv(s, &mx, &my, ref, left, top, tr);
2315
            mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2316
            my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2317
        }
2318
        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2319
    }else{
2320
        decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2321
        decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2322
        decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2323
        decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2324
    }
2325
}
2326

    
2327
static void encode_blocks(SnowContext *s, int search){
2328
    int x, y;
2329
    int w= s->b_width;
2330
    int h= s->b_height;
2331

    
2332
    if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2333
        iterative_me(s);
2334

    
2335
    for(y=0; y<h; y++){
2336
        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2337
            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2338
            return;
2339
        }
2340
        for(x=0; x<w; x++){
2341
            if(s->avctx->me_method == ME_ITER || !search)
2342
                encode_q_branch2(s, 0, x, y);
2343
            else
2344
                encode_q_branch (s, 0, x, y);
2345
        }
2346
    }
2347
}
2348

    
2349
static void decode_blocks(SnowContext *s){
2350
    int x, y;
2351
    int w= s->b_width;
2352
    int h= s->b_height;
2353

    
2354
    for(y=0; y<h; y++){
2355
        for(x=0; x<w; x++){
2356
            decode_q_branch(s, 0, x, y);
2357
        }
2358
    }
2359
}
2360

    
2361
static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2362
    int x, y;
2363
START_TIMER
2364
    for(y=0; y < b_h+5; y++){
2365
        for(x=0; x < b_w; x++){
2366
            int a0= src[x    ];
2367
            int a1= src[x + 1];
2368
            int a2= src[x + 2];
2369
            int a3= src[x + 3];
2370
            int a4= src[x + 4];
2371
            int a5= src[x + 5];
2372
//            int am= 9*(a1+a2) - (a0+a3);
2373
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2374
//            int am= 18*(a2+a3) - 2*(a1+a4);
2375
//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2376
//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2377

    
2378
//            if(b_w==16) am= 8*(a1+a2);
2379

    
2380
            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2381
            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2382

    
2383
            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2384
            if(am&(~255)) am= ~(am>>31);
2385

    
2386
            tmp[x] = am;
2387

    
2388
/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2389
            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2390
            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2391
            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2392
        }
2393
        tmp += stride;
2394
        src += stride;
2395
    }
2396
    tmp -= (b_h+5)*stride;
2397

    
2398
    for(y=0; y < b_h; y++){
2399
        for(x=0; x < b_w; x++){
2400
            int a0= tmp[x + 0*stride];
2401
            int a1= tmp[x + 1*stride];
2402
            int a2= tmp[x + 2*stride];
2403
            int a3= tmp[x + 3*stride];
2404
            int a4= tmp[x + 4*stride];
2405
            int a5= tmp[x + 5*stride];
2406
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2407
//            int am= 18*(a2+a3) - 2*(a1+a4);
2408
/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2409
            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2410

    
2411
//            if(b_w==16) am= 8*(a1+a2);
2412

    
2413
            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2414
            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2415

    
2416
            if(am&(~255)) am= ~(am>>31);
2417

    
2418
            dst[x] = am;
2419
/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2420
            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2421
            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2422
            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2423
        }
2424
        dst += stride;
2425
        tmp += stride;
2426
    }
2427
STOP_TIMER("mc_block")
2428
}
2429

    
2430
#define mca(dx,dy,b_w)\
2431
static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
2432
    uint8_t tmp[stride*(b_w+5)];\
2433
    assert(h==b_w);\
2434
    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2435
}
2436

    
2437
mca( 0, 0,16)
2438
mca( 8, 0,16)
2439
mca( 0, 8,16)
2440
mca( 8, 8,16)
2441
mca( 0, 0,8)
2442
mca( 8, 0,8)
2443
mca( 0, 8,8)
2444
mca( 8, 8,8)
2445

    
2446
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2447
    if(block->type & BLOCK_INTRA){
2448
        int x, y;
2449
        const int color = block->color[plane_index];
2450
        const int color4= color*0x01010101;
2451
        if(b_w==32){
2452
            for(y=0; y < b_h; y++){
2453
                *(uint32_t*)&dst[0 + y*stride]= color4;
2454
                *(uint32_t*)&dst[4 + y*stride]= color4;
2455
                *(uint32_t*)&dst[8 + y*stride]= color4;
2456
                *(uint32_t*)&dst[12+ y*stride]= color4;
2457
                *(uint32_t*)&dst[16+ y*stride]= color4;
2458
                *(uint32_t*)&dst[20+ y*stride]= color4;
2459
                *(uint32_t*)&dst[24+ y*stride]= color4;
2460
                *(uint32_t*)&dst[28+ y*stride]= color4;
2461
            }
2462
        }else if(b_w==16){
2463
            for(y=0; y < b_h; y++){
2464
                *(uint32_t*)&dst[0 + y*stride]= color4;
2465
                *(uint32_t*)&dst[4 + y*stride]= color4;
2466
                *(uint32_t*)&dst[8 + y*stride]= color4;
2467
                *(uint32_t*)&dst[12+ y*stride]= color4;
2468
            }
2469
        }else if(b_w==8){
2470
            for(y=0; y < b_h; y++){
2471
                *(uint32_t*)&dst[0 + y*stride]= color4;
2472
                *(uint32_t*)&dst[4 + y*stride]= color4;
2473
            }
2474
        }else if(b_w==4){
2475
            for(y=0; y < b_h; y++){
2476
                *(uint32_t*)&dst[0 + y*stride]= color4;
2477
            }
2478
        }else{
2479
            for(y=0; y < b_h; y++){
2480
                for(x=0; x < b_w; x++){
2481
                    dst[x + y*stride]= color;
2482
                }
2483
            }
2484
        }
2485
    }else{
2486
        uint8_t *src= s->last_picture[block->ref].data[plane_index];
2487
        const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2488
        int mx= block->mx*scale;
2489
        int my= block->my*scale;
2490
        const int dx= mx&15;
2491
        const int dy= my&15;
2492
        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2493
        sx += (mx>>4) - 2;
2494
        sy += (my>>4) - 2;
2495
        src += sx + sy*stride;
2496
        if(   (unsigned)sx >= w - b_w - 4
2497
           || (unsigned)sy >= h - b_h - 4){
2498
            ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2499
            src= tmp + MB_SIZE;
2500
        }
2501
//        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2502
//        assert(!(b_w&(b_w-1)));
2503
        assert(b_w>1 && b_h>1);
2504
        assert(tab_index>=0 && tab_index<4 || b_w==32);
2505
        if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2506
            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2507
        else if(b_w==32){
2508
            int y;
2509
            for(y=0; y<b_h; y+=16){
2510
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2511
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2512
            }
2513
        }else if(b_w==b_h)
2514
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2515
        else if(b_w==2*b_h){
2516
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2517
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2518
        }else{
2519
            assert(2*b_w==b_h);
2520
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2521
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2522
        }
2523
    }
2524
}
2525

    
2526
void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2527
                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2528
    int y, x;
2529
    DWTELEM * dst;
2530
    for(y=0; y<b_h; y++){
2531
        //FIXME ugly missue of obmc_stride
2532
        uint8_t *obmc1= obmc + y*obmc_stride;
2533
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2534
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2535
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2536
        dst = slice_buffer_get_line(sb, src_y + y);
2537
        for(x=0; x<b_w; x++){
2538
            int v=   obmc1[x] * block[3][x + y*src_stride]
2539
                    +obmc2[x] * block[2][x + y*src_stride]
2540
                    +obmc3[x] * block[1][x + y*src_stride]
2541
                    +obmc4[x] * block[0][x + y*src_stride];
2542

    
2543
            v <<= 8 - LOG2_OBMC_MAX;
2544
            if(FRAC_BITS != 8){
2545
                v += 1<<(7 - FRAC_BITS);
2546
                v >>= 8 - FRAC_BITS;
2547
            }
2548
            if(add){
2549
                v += dst[x + src_x];
2550
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2551
                if(v&(~255)) v= ~(v>>31);
2552
                dst8[x + y*src_stride] = v;
2553
            }else{
2554
                dst[x + src_x] -= v;
2555
            }
2556
        }
2557
    }
2558
}
2559

    
2560
//FIXME name clenup (b_w, block_w, b_width stuff)
2561
static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2562
    const int b_width = s->b_width  << s->block_max_depth;
2563
    const int b_height= s->b_height << s->block_max_depth;
2564
    const int b_stride= b_width;
2565
    BlockNode *lt= &s->block[b_x + b_y*b_stride];
2566
    BlockNode *rt= lt+1;
2567
    BlockNode *lb= lt+b_stride;
2568
    BlockNode *rb= lb+1;
2569
    uint8_t *block[4];
2570
    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2571
    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2572
    uint8_t *ptmp;
2573
    int x,y;
2574

    
2575
    if(b_x<0){
2576
        lt= rt;
2577
        lb= rb;
2578
    }else if(b_x + 1 >= b_width){
2579
        rt= lt;
2580
        rb= lb;
2581
    }
2582
    if(b_y<0){
2583
        lt= lb;
2584
        rt= rb;
2585
    }else if(b_y + 1 >= b_height){
2586
        lb= lt;
2587
        rb= rt;
2588
    }
2589

    
2590
    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2591
        obmc -= src_x;
2592
        b_w += src_x;
2593
        if(!sliced && !offset_dst)
2594
            dst -= src_x;
2595
        src_x=0;
2596
    }else if(src_x + b_w > w){
2597
        b_w = w - src_x;
2598
    }
2599
    if(src_y<0){
2600
        obmc -= src_y*obmc_stride;
2601
        b_h += src_y;
2602
        if(!sliced && !offset_dst)
2603
            dst -= src_y*dst_stride;
2604
        src_y=0;
2605
    }else if(src_y + b_h> h){
2606
        b_h = h - src_y;
2607
    }
2608

    
2609
    if(b_w<=0 || b_h<=0) return;
2610

    
2611
assert(src_stride > 2*MB_SIZE + 5);
2612
    if(!sliced && offset_dst)
2613
        dst += src_x + src_y*dst_stride;
2614
    dst8+= src_x + src_y*src_stride;
2615
//    src += src_x + src_y*src_stride;
2616

    
2617
    ptmp= tmp + 3*tmp_step;
2618
    block[0]= ptmp;
2619
    ptmp+=tmp_step;
2620
    pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2621

    
2622
    if(same_block(lt, rt)){
2623
        block[1]= block[0];
2624
    }else{
2625
        block[1]= ptmp;
2626
        ptmp+=tmp_step;
2627
        pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2628
    }
2629

    
2630
    if(same_block(lt, lb)){
2631
        block[2]= block[0];
2632
    }else if(same_block(rt, lb)){
2633
        block[2]= block[1];
2634
    }else{
2635
        block[2]= ptmp;
2636
        ptmp+=tmp_step;
2637
        pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2638
    }
2639

    
2640
    if(same_block(lt, rb) ){
2641
        block[3]= block[0];
2642
    }else if(same_block(rt, rb)){
2643
        block[3]= block[1];
2644
    }else if(same_block(lb, rb)){
2645
        block[3]= block[2];
2646
    }else{
2647
        block[3]= ptmp;
2648
        pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2649
    }
2650
#if 0
2651
    for(y=0; y<b_h; y++){
2652
        for(x=0; x<b_w; x++){
2653
            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2654
            if(add) dst[x + y*dst_stride] += v;
2655
            else    dst[x + y*dst_stride] -= v;
2656
        }
2657
    }
2658
    for(y=0; y<b_h; y++){
2659
        uint8_t *obmc2= obmc + (obmc_stride>>1);
2660
        for(x=0; x<b_w; x++){
2661
            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2662
            if(add) dst[x + y*dst_stride] += v;
2663
            else    dst[x + y*dst_stride] -= v;
2664
        }
2665
    }
2666
    for(y=0; y<b_h; y++){
2667
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2668
        for(x=0; x<b_w; x++){
2669
            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2670
            if(add) dst[x + y*dst_stride] += v;
2671
            else    dst[x + y*dst_stride] -= v;
2672
        }
2673
    }
2674
    for(y=0; y<b_h; y++){
2675
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2676
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2677
        for(x=0; x<b_w; x++){
2678
            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2679
            if(add) dst[x + y*dst_stride] += v;
2680
            else    dst[x + y*dst_stride] -= v;
2681
        }
2682
    }
2683
#else
2684
    if(sliced){
2685
        START_TIMER
2686

    
2687
        s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2688
        STOP_TIMER("inner_add_yblock")
2689
    }else
2690
    for(y=0; y<b_h; y++){
2691
        //FIXME ugly missue of obmc_stride
2692
        uint8_t *obmc1= obmc + y*obmc_stride;
2693
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2694
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2695
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2696
        for(x=0; x<b_w; x++){
2697
            int v=   obmc1[x] * block[3][x + y*src_stride]
2698
                    +obmc2[x] * block[2][x + y*src_stride]
2699
                    +obmc3[x] * block[1][x + y*src_stride]
2700
                    +obmc4[x] * block[0][x + y*src_stride];
2701

    
2702
            v <<= 8 - LOG2_OBMC_MAX;
2703
            if(FRAC_BITS != 8){
2704
                v += 1<<(7 - FRAC_BITS);
2705
                v >>= 8 - FRAC_BITS;
2706
            }
2707
            if(add){
2708
                v += dst[x + y*dst_stride];
2709
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2710
                if(v&(~255)) v= ~(v>>31);
2711
                dst8[x + y*src_stride] = v;
2712
            }else{
2713
                dst[x + y*dst_stride] -= v;
2714
            }
2715
        }
2716
    }
2717
#endif
2718
}
2719

    
2720
static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2721
    Plane *p= &s->plane[plane_index];
2722
    const int mb_w= s->b_width  << s->block_max_depth;
2723
    const int mb_h= s->b_height << s->block_max_depth;
2724
    int x, y, mb_x;
2725
    int block_size = MB_SIZE >> s->block_max_depth;
2726
    int block_w    = plane_index ? block_size/2 : block_size;
2727
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2728
    int obmc_stride= plane_index ? block_size : 2*block_size;
2729
    int ref_stride= s->current_picture.linesize[plane_index];
2730
    uint8_t *dst8= s->current_picture.data[plane_index];
2731
    int w= p->width;
2732
    int h= p->height;
2733
    START_TIMER
2734

    
2735
    if(s->keyframe || (s->avctx->debug&512)){
2736
        if(mb_y==mb_h)
2737
            return;
2738

    
2739
        if(add){
2740
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2741
            {
2742
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2743
                DWTELEM * line = sb->line[y];
2744
                for(x=0; x<w; x++)
2745
                {
2746
//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2747
                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2748
                    v >>= FRAC_BITS;
2749
                    if(v&(~255)) v= ~(v>>31);
2750
                    dst8[x + y*ref_stride]= v;
2751
                }
2752
            }
2753
        }else{
2754
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2755
            {
2756
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2757
                DWTELEM * line = sb->line[y];
2758
                for(x=0; x<w; x++)
2759
                {
2760
                    line[x] -= 128 << FRAC_BITS;
2761
//                    buf[x + y*w]-= 128<<FRAC_BITS;
2762
                }
2763
            }
2764
        }
2765

    
2766
        return;
2767
    }
2768

    
2769
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2770
            START_TIMER
2771

    
2772
            add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2773
                       block_w*mb_x - block_w/2,
2774
                       block_w*mb_y - block_w/2,
2775
                       block_w, block_w,
2776
                       w, h,
2777
                       w, ref_stride, obmc_stride,
2778
                       mb_x - 1, mb_y - 1,
2779
                       add, 0, plane_index);
2780

    
2781
            STOP_TIMER("add_yblock")
2782
        }
2783

    
2784
    STOP_TIMER("predict_slice")
2785
}
2786

    
2787
static av_always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2788
    Plane *p= &s->plane[plane_index];
2789
    const int mb_w= s->b_width  << s->block_max_depth;
2790
    const int mb_h= s->b_height << s->block_max_depth;
2791
    int x, y, mb_x;
2792
    int block_size = MB_SIZE >> s->block_max_depth;
2793
    int block_w    = plane_index ? block_size/2 : block_size;
2794
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2795
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2796
    int ref_stride= s->current_picture.linesize[plane_index];
2797
    uint8_t *dst8= s->current_picture.data[plane_index];
2798
    int w= p->width;
2799
    int h= p->height;
2800
    START_TIMER
2801

    
2802
    if(s->keyframe || (s->avctx->debug&512)){
2803
        if(mb_y==mb_h)
2804
            return;
2805

    
2806
        if(add){
2807
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2808
                for(x=0; x<w; x++){
2809
                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2810
                    v >>= FRAC_BITS;
2811
                    if(v&(~255)) v= ~(v>>31);
2812
                    dst8[x + y*ref_stride]= v;
2813
                }
2814
            }
2815
        }else{
2816
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2817
                for(x=0; x<w; x++){
2818
                    buf[x + y*w]-= 128<<FRAC_BITS;
2819
                }
2820
            }
2821
        }
2822

    
2823
        return;
2824
    }
2825

    
2826
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2827
            START_TIMER
2828

    
2829
            add_yblock(s, 0, NULL, buf, dst8, obmc,
2830
                       block_w*mb_x - block_w/2,
2831
                       block_w*mb_y - block_w/2,
2832
                       block_w, block_w,
2833
                       w, h,
2834
                       w, ref_stride, obmc_stride,
2835
                       mb_x - 1, mb_y - 1,
2836
                       add, 1, plane_index);
2837

    
2838
            STOP_TIMER("add_yblock")
2839
        }
2840

    
2841
    STOP_TIMER("predict_slice")
2842
}
2843

    
2844
static av_always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2845
    const int mb_h= s->b_height << s->block_max_depth;
2846
    int mb_y;
2847
    for(mb_y=0; mb_y<=mb_h; mb_y++)
2848
        predict_slice(s, buf, plane_index, add, mb_y);
2849
}
2850

    
2851
static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2852
    int i, x2, y2;
2853
    Plane *p= &s->plane[plane_index];
2854
    const int block_size = MB_SIZE >> s->block_max_depth;
2855
    const int block_w    = plane_index ? block_size/2 : block_size;
2856
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2857
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2858
    const int ref_stride= s->current_picture.linesize[plane_index];
2859
    uint8_t *src= s-> input_picture.data[plane_index];
2860
    DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2861
    const int b_stride = s->b_width << s->block_max_depth;
2862
    const int w= p->width;
2863
    const int h= p->height;
2864
    int index= mb_x + mb_y*b_stride;
2865
    BlockNode *b= &s->block[index];
2866
    BlockNode backup= *b;
2867
    int ab=0;
2868
    int aa=0;
2869

    
2870
    b->type|= BLOCK_INTRA;
2871
    b->color[plane_index]= 0;
2872
    memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2873

    
2874
    for(i=0; i<4; i++){
2875
        int mb_x2= mb_x + (i &1) - 1;
2876
        int mb_y2= mb_y + (i>>1) - 1;
2877
        int x= block_w*mb_x2 + block_w/2;
2878
        int y= block_w*mb_y2 + block_w/2;
2879

    
2880
        add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2881
                    x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2882

    
2883
        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2884
            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2885
                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2886
                int obmc_v= obmc[index];
2887
                int d;
2888
                if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2889
                if(x<0) obmc_v += obmc[index + block_w];
2890
                if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2891
                if(x+block_w>w) obmc_v += obmc[index - block_w];
2892
                //FIXME precalc this or simplify it somehow else
2893

    
2894
                d = -dst[index] + (1<<(FRAC_BITS-1));
2895
                dst[index] = d;
2896
                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2897
                aa += obmc_v * obmc_v; //FIXME precalclate this
2898
            }
2899
        }
2900
    }
2901
    *b= backup;
2902

    
2903
    return clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
2904
}
2905

    
2906
static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2907
    const int b_stride = s->b_width << s->block_max_depth;
2908
    const int b_height = s->b_height<< s->block_max_depth;
2909
    int index= x + y*b_stride;
2910
    BlockNode *b     = &s->block[index];
2911
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2912
    BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2913
    BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2914
    BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2915
    int dmx, dmy;
2916
//  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2917
//  int my_context= av_log2(2*FFABS(left->my - top->my));
2918

    
2919
    if(x<0 || x>=b_stride || y>=b_height)
2920
        return 0;
2921
/*
2922
1            0      0
2923
01X          1-2    1
2924
001XX        3-6    2-3
2925
0001XXX      7-14   4-7
2926
00001XXXX   15-30   8-15
2927
*/
2928
//FIXME try accurate rate
2929
//FIXME intra and inter predictors if surrounding blocks arent the same type
2930
    if(b->type & BLOCK_INTRA){
2931
        return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2932
                   + av_log2(2*FFABS(left->color[1] - b->color[1]))
2933
                   + av_log2(2*FFABS(left->color[2] - b->color[2])));
2934
    }else{
2935
        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2936
        dmx-= b->mx;
2937
        dmy-= b->my;
2938
        return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2939
                    + av_log2(2*FFABS(dmy))
2940
                    + av_log2(2*b->ref));
2941
    }
2942
}
2943

    
2944
static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2945
    Plane *p= &s->plane[plane_index];
2946
    const int block_size = MB_SIZE >> s->block_max_depth;
2947
    const int block_w    = plane_index ? block_size/2 : block_size;
2948
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2949
    const int ref_stride= s->current_picture.linesize[plane_index];
2950
    uint8_t *dst= s->current_picture.data[plane_index];
2951
    uint8_t *src= s->  input_picture.data[plane_index];
2952
    DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2953
    uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2954
    uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
2955
    const int b_stride = s->b_width << s->block_max_depth;
2956
    const int b_height = s->b_height<< s->block_max_depth;
2957
    const int w= p->width;
2958
    const int h= p->height;
2959
    int distortion;
2960
    int rate= 0;
2961
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2962
    int sx= block_w*mb_x - block_w/2;
2963
    int sy= block_w*mb_y - block_w/2;
2964
    int x0= FFMAX(0,-sx);
2965
    int y0= FFMAX(0,-sy);
2966
    int x1= FFMIN(block_w*2, w-sx);
2967
    int y1= FFMIN(block_w*2, h-sy);
2968
    int i,x,y;
2969

    
2970
    pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2971

    
2972
    for(y=y0; y<y1; y++){
2973
        const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2974
        const DWTELEM *pred1 = pred + y*obmc_stride;
2975
        uint8_t *cur1 = cur + y*ref_stride;
2976
        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2977
        for(x=x0; x<x1; x++){
2978
            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2979
            v = (v + pred1[x]) >> FRAC_BITS;
2980
            if(v&(~255)) v= ~(v>>31);
2981
            dst1[x] = v;
2982
        }
2983
    }
2984

    
2985
    /* copy the regions where obmc[] = (uint8_t)256 */
2986
    if(LOG2_OBMC_MAX == 8
2987
        && (mb_x == 0 || mb_x == b_stride-1)
2988
        && (mb_y == 0 || mb_y == b_height-1)){
2989
        if(mb_x == 0)
2990
            x1 = block_w;
2991
        else
2992
            x0 = block_w;
2993
        if(mb_y == 0)
2994
            y1 = block_w;
2995
        else
2996
            y0 = block_w;
2997
        for(y=y0; y<y1; y++)
2998
            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2999
    }
3000

    
3001
    if(block_w==16){
3002
        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
3003
        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
3004
        /* FIXME cmps overlap but don't cover the wavelet's whole support,
3005
         * so improving the score of one block is not strictly guaranteed to
3006
         * improve the score of the whole frame, so iterative motion est
3007
         * doesn't always converge. */
3008
        if(s->avctx->me_cmp == FF_CMP_W97)
3009
            distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3010
        else if(s->avctx->me_cmp == FF_CMP_W53)
3011
            distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3012
        else{
3013
            distortion = 0;
3014
            for(i=0; i<4; i++){
3015
                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3016
                distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3017
            }
3018
        }
3019
    }else{
3020
        assert(block_w==8);
3021
        distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3022
    }
3023

    
3024
    if(plane_index==0){
3025
        for(i=0; i<4; i++){
3026
/* ..RRr
3027
 * .RXx.
3028
 * rxx..
3029
 */
3030
            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3031
        }
3032
        if(mb_x == b_stride-2)
3033
            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3034
    }
3035
    return distortion + rate*penalty_factor;
3036
}
3037

    
3038
static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3039
    int i, y2;
3040
    Plane *p= &s->plane[plane_index];
3041
    const int block_size = MB_SIZE >> s->block_max_depth;
3042
    const int block_w    = plane_index ? block_size/2 : block_size;
3043
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3044
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3045
    const int ref_stride= s->current_picture.linesize[plane_index];
3046
    uint8_t *dst= s->current_picture.data[plane_index];
3047
    uint8_t *src= s-> input_picture.data[plane_index];
3048
    static const DWTELEM zero_dst[4096]; //FIXME
3049
    const int b_stride = s->b_width << s->block_max_depth;
3050
    const int w= p->width;
3051
    const int h= p->height;
3052
    int distortion= 0;
3053
    int rate= 0;
3054
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3055

    
3056
    for(i=0; i<9; i++){
3057
        int mb_x2= mb_x + (i%3) - 1;
3058
        int mb_y2= mb_y + (i/3) - 1;
3059
        int x= block_w*mb_x2 + block_w/2;
3060
        int y= block_w*mb_y2 + block_w/2;
3061

    
3062
        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
3063
                   x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3064

    
3065
        //FIXME find a cleaner/simpler way to skip the outside stuff
3066
        for(y2= y; y2<0; y2++)
3067
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3068
        for(y2= h; y2<y+block_w; y2++)
3069
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3070
        if(x<0){
3071
            for(y2= y; y2<y+block_w; y2++)
3072
                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3073
        }
3074
        if(x+block_w > w){
3075
            for(y2= y; y2<y+block_w; y2++)
3076
                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3077
        }
3078

    
3079
        assert(block_w== 8 || block_w==16);
3080
        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3081
    }
3082

    
3083
    if(plane_index==0){
3084
        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3085
        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3086

    
3087
/* ..RRRr
3088
 * .RXXx.
3089
 * .RXXx.
3090
 * rxxx.
3091
 */
3092
        if(merged)
3093
            rate = get_block_bits(s, mb_x, mb_y, 2);
3094
        for(i=merged?4:0; i<9; i++){
3095
            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3096
            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3097
        }
3098
    }
3099
    return distortion + rate*penalty_factor;
3100
}
3101

    
3102
static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3103
    const int b_stride= s->b_width << s->block_max_depth;
3104
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3105
    BlockNode backup= *block;
3106
    int rd, index, value;
3107

    
3108
    assert(mb_x>=0 && mb_y>=0);
3109
    assert(mb_x<b_stride);
3110

    
3111
    if(intra){
3112
        block->color[0] = p[0];
3113
        block->color[1] = p[1];
3114
        block->color[2] = p[2];
3115
        block->type |= BLOCK_INTRA;
3116
    }else{
3117
        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3118
        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3119
        if(s->me_cache[index] == value)
3120
            return 0;
3121
        s->me_cache[index]= value;
3122

    
3123
        block->mx= p[0];
3124
        block->my= p[1];
3125
        block->type &= ~BLOCK_INTRA;
3126
    }
3127

    
3128
    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3129

    
3130
//FIXME chroma
3131
    if(rd < *best_rd){
3132
        *best_rd= rd;
3133
        return 1;
3134
    }else{
3135
        *block= backup;
3136
        return 0;
3137
    }
3138
}
3139

    
3140
/* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3141
static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3142
    int p[2] = {p0, p1};
3143
    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3144
}
3145

    
3146
static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3147
    const int b_stride= s->b_width << s->block_max_depth;
3148
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3149
    BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3150
    int rd, index, value;
3151

    
3152
    assert(mb_x>=0 && mb_y>=0);
3153
    assert(mb_x<b_stride);
3154
    assert(((mb_x|mb_y)&1) == 0);
3155

    
3156
    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3157
    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3158
    if(s->me_cache[index] == value)
3159
        return 0;
3160
    s->me_cache[index]= value;
3161

    
3162
    block->mx= p0;
3163
    block->my= p1;
3164
    block->ref= ref;
3165
    block->type &= ~BLOCK_INTRA;
3166
    block[1]= block[b_stride]= block[b_stride+1]= *block;
3167

    
3168
    rd= get_4block_rd(s, mb_x, mb_y, 0);
3169

    
3170
//FIXME chroma
3171
    if(rd < *best_rd){
3172
        *best_rd= rd;
3173
        return 1;
3174
    }else{
3175
        block[0]= backup[0];
3176
        block[1]= backup[1];
3177
        block[b_stride]= backup[2];
3178
        block[b_stride+1]= backup[3];
3179
        return 0;
3180
    }
3181
}
3182

    
3183
static void iterative_me(SnowContext *s){
3184
    int pass, mb_x, mb_y;
3185
    const int b_width = s->b_width  << s->block_max_depth;
3186
    const int b_height= s->b_height << s->block_max_depth;
3187
    const int b_stride= b_width;
3188
    int color[3];
3189

    
3190
    {
3191
        RangeCoder r = s->c;
3192
        uint8_t state[sizeof(s->block_state)];
3193
        memcpy(state, s->block_state, sizeof(s->block_state));
3194
        for(mb_y= 0; mb_y<s->b_height; mb_y++)
3195
            for(mb_x= 0; mb_x<s->b_width; mb_x++)
3196
                encode_q_branch(s, 0, mb_x, mb_y);
3197
        s->c = r;
3198
        memcpy(s->block_state, state, sizeof(s->block_state));
3199
    }
3200

    
3201
    for(pass=0; pass<25; pass++){
3202
        int change= 0;
3203

    
3204
        for(mb_y= 0; mb_y<b_height; mb_y++){
3205
            for(mb_x= 0; mb_x<b_width; mb_x++){
3206
                int dia_change, i, j, ref;
3207
                int best_rd= INT_MAX, ref_rd;
3208
                BlockNode backup, ref_b;
3209
                const int index= mb_x + mb_y * b_stride;
3210
                BlockNode *block= &s->block[index];
3211
                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3212
                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3213
                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3214
                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3215
                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3216
                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3217
                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3218
                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3219
                const int b_w= (MB_SIZE >> s->block_max_depth);
3220
                uint8_t obmc_edged[b_w*2][b_w*2];
3221

    
3222
                if(pass && (block->type & BLOCK_OPT))
3223
                    continue;
3224
                block->type |= BLOCK_OPT;
3225

    
3226
                backup= *block;
3227

    
3228
                if(!s->me_cache_generation)
3229
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3230
                s->me_cache_generation += 1<<22;
3231

    
3232
                //FIXME precalc
3233
                {
3234
                    int x, y;
3235
                    memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3236
                    if(mb_x==0)
3237
                        for(y=0; y<b_w*2; y++)
3238
                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3239
                    if(mb_x==b_stride-1)
3240
                        for(y=0; y<b_w*2; y++)
3241
                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3242
                    if(mb_y==0){
3243
                        for(x=0; x<b_w*2; x++)
3244
                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
3245
                        for(y=1; y<b_w; y++)
3246
                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3247
                    }
3248
                    if(mb_y==b_height-1){
3249
                        for(x=0; x<b_w*2; x++)
3250
                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3251
                        for(y=b_w; y<b_w*2-1; y++)
3252
                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3253
                    }
3254
                }
3255

    
3256
                //skip stuff outside the picture
3257
                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3258
                {
3259
                    uint8_t *src= s->  input_picture.data[0];
3260
                    uint8_t *dst= s->current_picture.data[0];
3261
                    const int stride= s->current_picture.linesize[0];
3262
                    const int block_w= MB_SIZE >> s->block_max_depth;
3263
                    const int sx= block_w*mb_x - block_w/2;
3264
                    const int sy= block_w*mb_y - block_w/2;
3265
                    const int w= s->plane[0].width;
3266
                    const int h= s->plane[0].height;
3267
                    int y;
3268

    
3269
                    for(y=sy; y<0; y++)
3270
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3271
                    for(y=h; y<sy+block_w*2; y++)
3272
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3273
                    if(sx<0){
3274
                        for(y=sy; y<sy+block_w*2; y++)
3275
                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3276
                    }
3277
                    if(sx+block_w*2 > w){
3278
                        for(y=sy; y<sy+block_w*2; y++)
3279
                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3280
                    }
3281
                }
3282

    
3283
                // intra(black) = neighbors' contribution to the current block
3284
                for(i=0; i<3; i++)
3285
                    color[i]= get_dc(s, mb_x, mb_y, i);
3286

    
3287
                // get previous score (cant be cached due to OBMC)
3288
                if(pass > 0 && (block->type&BLOCK_INTRA)){
3289
                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
3290
                    check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3291
                }else
3292
                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3293

    
3294
                ref_b= *block;
3295
                ref_rd= best_rd;
3296
                for(ref=0; ref < s->ref_frames; ref++){
3297
                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3298
                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3299
                        continue;
3300
                    block->ref= ref;
3301
                    best_rd= INT_MAX;
3302

    
3303
                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3304
                    check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3305
                    if(tb)
3306
                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3307
                    if(lb)
3308
                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3309
                    if(rb)
3310
                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3311
                    if(bb)
3312
                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3313

    
3314
                    /* fullpel ME */
3315
                    //FIXME avoid subpel interpol / round to nearest integer
3316
                    do{
3317
                        dia_change=0;
3318
                        for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3319
                            for(j=0; j<i; j++){
3320
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3321
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3322
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3323
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3324
                            }
3325
                        }
3326
                    }while(dia_change);
3327
                    /* subpel ME */
3328
                    do{
3329
                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3330
                        dia_change=0;
3331
                        for(i=0; i<8; i++)
3332
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3333
                    }while(dia_change);
3334
                    //FIXME or try the standard 2 pass qpel or similar
3335

    
3336
                    mvr[0][0]= block->mx;
3337
                    mvr[0][1]= block->my;
3338
                    if(ref_rd > best_rd){
3339
                        ref_rd= best_rd;
3340
                        ref_b= *block;
3341
                    }
3342
                }
3343
                best_rd= ref_rd;
3344
                *block= ref_b;
3345
#if 1
3346
                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3347
                //FIXME RD style color selection
3348
#endif
3349
                if(!same_block(block, &backup)){
3350
                    if(tb ) tb ->type &= ~BLOCK_OPT;
3351
                    if(lb ) lb ->type &= ~BLOCK_OPT;
3352
                    if(rb ) rb ->type &= ~BLOCK_OPT;
3353
                    if(bb ) bb ->type &= ~BLOCK_OPT;
3354
                    if(tlb) tlb->type &= ~BLOCK_OPT;
3355
                    if(trb) trb->type &= ~BLOCK_OPT;
3356
                    if(blb) blb->type &= ~BLOCK_OPT;
3357
                    if(brb) brb->type &= ~BLOCK_OPT;
3358
                    change ++;
3359
                }
3360
            }
3361
        }
3362
        av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3363
        if(!change)
3364
            break;
3365
    }
3366

    
3367
    if(s->block_max_depth == 1){
3368
        int change= 0;
3369
        for(mb_y= 0; mb_y<b_height; mb_y+=2){
3370
            for(mb_x= 0; mb_x<b_width; mb_x+=2){
3371
                int i;
3372
                int best_rd, init_rd;
3373
                const int index= mb_x + mb_y * b_stride;
3374
                BlockNode *b[4];
3375

    
3376
                b[0]= &s->block[index];
3377
                b[1]= b[0]+1;
3378
                b[2]= b[0]+b_stride;
3379
                b[3]= b[2]+1;
3380
                if(same_block(b[0], b[1]) &&
3381
                   same_block(b[0], b[2]) &&
3382
                   same_block(b[0], b[3]))
3383
                    continue;
3384

    
3385
                if(!s->me_cache_generation)
3386
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3387
                s->me_cache_generation += 1<<22;
3388

    
3389
                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3390

    
3391
                //FIXME more multiref search?
3392
                check_4block_inter(s, mb_x, mb_y,
3393
                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3394
                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3395

    
3396
                for(i=0; i<4; i++)
3397
                    if(!(b[i]->type&BLOCK_INTRA))
3398
                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3399

    
3400
                if(init_rd != best_rd)
3401
                    change++;
3402
            }
3403
        }
3404
        av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3405
    }
3406
}
3407

    
3408
static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3409
    const int level= b->level;
3410
    const int w= b->width;
3411
    const int h= b->height;
3412
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3413
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3414
    int x,y, thres1, thres2;
3415
//    START_TIMER
3416

    
3417
    if(s->qlog == LOSSLESS_QLOG) return;
3418

    
3419
    bias= bias ? 0 : (3*qmul)>>3;
3420
    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3421
    thres2= 2*thres1;
3422

    
3423
    if(!bias){
3424
        for(y=0; y<h; y++){
3425
            for(x=0; x<w; x++){
3426
                int i= src[x + y*stride];
3427

    
3428
                if((unsigned)(i+thres1) > thres2){
3429
                    if(i>=0){
3430
                        i<<= QEXPSHIFT;
3431
                        i/= qmul; //FIXME optimize
3432
                        src[x + y*stride]=  i;
3433
                    }else{
3434
                        i= -i;
3435
                        i<<= QEXPSHIFT;
3436
                        i/= qmul; //FIXME optimize
3437
                        src[x + y*stride]= -i;
3438
                    }
3439
                }else
3440
                    src[x + y*stride]= 0;
3441
            }
3442
        }
3443
    }else{
3444
        for(y=0; y<h; y++){
3445
            for(x=0; x<w; x++){
3446
                int i= src[x + y*stride];
3447

    
3448
                if((unsigned)(i+thres1) > thres2){
3449
                    if(i>=0){
3450
                        i<<= QEXPSHIFT;
3451
                        i= (i + bias) / qmul; //FIXME optimize
3452
                        src[x + y*stride]=  i;
3453
                    }else{
3454
                        i= -i;
3455
                        i<<= QEXPSHIFT;
3456
                        i= (i + bias) / qmul; //FIXME optimize
3457
                        src[x + y*stride]= -i;
3458
                    }
3459
                }else
3460
                    src[x + y*stride]= 0;
3461
            }
3462
        }
3463
    }
3464
    if(level+1 == s->spatial_decomposition_count){
3465
//        STOP_TIMER("quantize")
3466
    }
3467
}
3468

    
3469
static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3470
    const int w= b->width;
3471
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3472
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3473
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3474
    int x,y;
3475
    START_TIMER
3476

    
3477
    if(s->qlog == LOSSLESS_QLOG) return;
3478

    
3479
    for(y=start_y; y<end_y; y++){
3480
//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3481
        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3482
        for(x=0; x<w; x++){
3483
            int i= line[x];
3484
            if(i<0){
3485
                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3486
            }else if(i>0){
3487
                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3488
            }
3489
        }
3490
    }
3491
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3492
        STOP_TIMER("dquant")
3493
    }
3494
}
3495

    
3496
static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3497
    const int w= b->width;
3498
    const int h= b->height;
3499
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3500
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3501
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3502
    int x,y;
3503
    START_TIMER
3504

    
3505
    if(s->qlog == LOSSLESS_QLOG) return;
3506

    
3507
    for(y=0; y<h; y++){
3508
        for(x=0; x<w; x++){
3509
            int i= src[x + y*stride];
3510
            if(i<0){
3511
                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3512
            }else if(i>0){
3513
                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3514
            }
3515
        }
3516
    }
3517
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3518
        STOP_TIMER("dquant")
3519
    }
3520
}
3521

    
3522
static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3523
    const int w= b->width;
3524
    const int h= b->height;
3525
    int x,y;
3526

    
3527
    for(y=h-1; y>=0; y--){
3528
        for(x=w-1; x>=0; x--){
3529
            int i= x + y*stride;
3530

    
3531
            if(x){
3532
                if(use_median){
3533
                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3534
                    else  src[i] -= src[i - 1];
3535
                }else{
3536
                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3537
                    else  src[i] -= src[i - 1];
3538
                }
3539
            }else{
3540
                if(y) src[i] -= src[i - stride];
3541
            }
3542
        }
3543
    }
3544
}
3545

    
3546
static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3547
    const int w= b->width;
3548
    int x,y;
3549

    
3550
//    START_TIMER
3551

    
3552
    DWTELEM * line;
3553
    DWTELEM * prev;
3554

    
3555
    if (start_y != 0)
3556
        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3557

    
3558
    for(y=start_y; y<end_y; y++){
3559
        prev = line;
3560
//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3561
        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3562
        for(x=0; x<w; x++){
3563
            if(x){
3564
                if(use_median){
3565
                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3566
                    else  line[x] += line[x - 1];
3567
                }else{
3568
                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3569
                    else  line[x] += line[x - 1];
3570
                }
3571
            }else{
3572
                if(y) line[x] += prev[x];
3573
            }
3574
        }
3575
    }
3576

    
3577
//    STOP_TIMER("correlate")
3578
}
3579

    
3580
static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3581
    const int w= b->width;
3582
    const int h= b->height;
3583
    int x,y;
3584

    
3585
    for(y=0; y<h; y++){
3586
        for(x=0; x<w; x++){
3587
            int i= x + y*stride;
3588

    
3589
            if(x){
3590
                if(use_median){
3591
                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3592
                    else  src[i] += src[i - 1];
3593
                }else{
3594
                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3595
                    else  src[i] += src[i - 1];
3596
                }
3597
            }else{
3598
                if(y) src[i] += src[i - stride];
3599
            }
3600
        }
3601
    }
3602
}
3603

    
3604
static void encode_header(SnowContext *s){
3605
    int plane_index, level, orientation;
3606
    uint8_t kstate[32];
3607

    
3608
    memset(kstate, MID_STATE, sizeof(kstate));
3609

    
3610
    put_rac(&s->c, kstate, s->keyframe);
3611
    if(s->keyframe || s->always_reset){
3612
        reset_contexts(s);
3613
        s->last_spatial_decomposition_type=
3614
        s->last_qlog=
3615
        s->last_qbias=
3616
        s->last_mv_scale=
3617
        s->last_block_max_depth= 0;
3618
    }
3619
    if(s->keyframe){
3620
        put_symbol(&s->c, s->header_state, s->version, 0);
3621
        put_rac(&s->c, s->header_state, s->always_reset);
3622
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3623
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3624
        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3625
        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3626
        put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3627
        put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3628
        put_rac(&s->c, s->header_state, s->spatial_scalability);
3629
//        put_rac(&s->c, s->header_state, s->rate_scalability);
3630
        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3631

    
3632
        for(plane_index=0; plane_index<2; plane_index++){
3633
            for(level=0; level<s->spatial_decomposition_count; level++){
3634
                for(orientation=level ? 1:0; orientation<4; orientation++){
3635
                    if(orientation==2) continue;
3636
                    put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3637
                }
3638
            }
3639
        }
3640
    }
3641
    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3642
    put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3643
    put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3644
    put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3645
    put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3646

    
3647
    s->last_spatial_decomposition_type= s->spatial_decomposition_type;
3648
    s->last_qlog                      = s->qlog;
3649
    s->last_qbias                     = s->qbias;
3650
    s->last_mv_scale                  = s->mv_scale;
3651
    s->last_block_max_depth           = s->block_max_depth;
3652
}
3653

    
3654
static int decode_header(SnowContext *s){
3655
    int plane_index, level, orientation;
3656
    uint8_t kstate[32];
3657

    
3658
    memset(kstate, MID_STATE, sizeof(kstate));
3659

    
3660
    s->keyframe= get_rac(&s->c, kstate);
3661
    if(s->keyframe || s->always_reset){
3662
        reset_contexts(s);
3663
        s->spatial_decomposition_type=
3664
        s->qlog=
3665
        s->qbias=
3666
        s->mv_scale=
3667
        s->block_max_depth= 0;
3668
    }
3669
    if(s->keyframe){
3670
        s->version= get_symbol(&s->c, s->header_state, 0);
3671
        if(s->version>0){
3672
            av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3673
            return -1;
3674
        }
3675
        s->always_reset= get_rac(&s->c, s->header_state);
3676
        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3677
        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3678
        s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3679
        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3680
        s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3681
        s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3682
        s->spatial_scalability= get_rac(&s->c, s->header_state);
3683
//        s->rate_scalability= get_rac(&s->c, s->header_state);
3684
        s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3685

    
3686
        for(plane_index=0; plane_index<3; plane_index++){
3687
            for(level=0; level<s->spatial_decomposition_count; level++){
3688
                for(orientation=level ? 1:0; orientation<4; orientation++){
3689
                    int q;
3690
                    if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3691
                    else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3692
                    else                    q= get_symbol(&s->c, s->header_state, 1);
3693
                    s->plane[plane_index].band[level][orientation].qlog= q;
3694
                }
3695
            }
3696
        }
3697
    }
3698

    
3699
    s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3700
    if(s->spatial_decomposition_type > 2){
3701
        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3702
        return -1;
3703
    }
3704

    
3705
    s->qlog           += get_symbol(&s->c, s->header_state, 1);
3706
    s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3707
    s->qbias          += get_symbol(&s->c, s->header_state, 1);
3708
    s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3709
    if(s->block_max_depth > 1 || s->block_max_depth < 0){
3710
        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3711
        s->block_max_depth= 0;
3712
        return -1;
3713
    }
3714

    
3715
    return 0;
3716
}
3717

    
3718
static void init_qexp(void){
3719
    int i;
3720
    double v=128;
3721

    
3722
    for(i=0; i<QROOT; i++){
3723
        qexp[i]= lrintf(v);
3724
        v *= pow(2, 1.0 / QROOT);
3725
    }
3726
}
3727

    
3728
static int common_init(AVCodecContext *avctx){
3729
    SnowContext *s = avctx->priv_data;
3730
    int width, height;
3731
    int level, orientation, plane_index, dec;
3732
    int i, j;
3733

    
3734
    s->avctx= avctx;
3735

    
3736
    dsputil_init(&s->dsp, avctx);
3737

    
3738
#define mcf(dx,dy)\
3739
    s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3740
    s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3741
        s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3742
    s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3743
    s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3744
        s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3745

    
3746
    mcf( 0, 0)
3747
    mcf( 4, 0)
3748
    mcf( 8, 0)
3749
    mcf(12, 0)
3750
    mcf( 0, 4)
3751
    mcf( 4, 4)
3752
    mcf( 8, 4)
3753
    mcf(12, 4)
3754
    mcf( 0, 8)
3755
    mcf( 4, 8)
3756
    mcf( 8, 8)
3757
    mcf(12, 8)
3758
    mcf( 0,12)
3759
    mcf( 4,12)
3760
    mcf( 8,12)
3761
    mcf(12,12)
3762

    
3763
#define mcfh(dx,dy)\
3764
    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3765
    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3766
        mc_block_hpel ## dx ## dy ## 16;\
3767
    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3768
    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3769
        mc_block_hpel ## dx ## dy ## 8;
3770

    
3771
    mcfh(0, 0)
3772
    mcfh(8, 0)
3773
    mcfh(0, 8)
3774
    mcfh(8, 8)
3775

    
3776
    if(!qexp[0])
3777
        init_qexp();
3778

    
3779
    dec= s->spatial_decomposition_count= 5;
3780
    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3781

    
3782
    s->chroma_h_shift= 1; //FIXME XXX
3783
    s->chroma_v_shift= 1;
3784

    
3785
//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3786

    
3787
    width= s->avctx->width;
3788
    height= s->avctx->height;
3789

    
3790
    s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3791

    
3792
    s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3793
    s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3794

    
3795
    for(plane_index=0; plane_index<3; plane_index++){
3796
        int w= s->avctx->width;
3797
        int h= s->avctx->height;
3798

    
3799
        if(plane_index){
3800
            w>>= s->chroma_h_shift;
3801
            h>>= s->chroma_v_shift;
3802
        }
3803
        s->plane[plane_index].width = w;
3804
        s->plane[plane_index].height= h;
3805
//av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3806
        for(level=s->spatial_decomposition_count-1; level>=0; level--){
3807
            for(orientation=level ? 1 : 0; orientation<4; orientation++){
3808
                SubBand *b= &s->plane[plane_index].band[level][orientation];
3809

    
3810
                b->buf= s->spatial_dwt_buffer;
3811
                b->level= level;
3812
                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3813
                b->width = (w + !(orientation&1))>>1;
3814
                b->height= (h + !(orientation>1))>>1;
3815

    
3816
                b->stride_line = 1 << (s->spatial_decomposition_count - level);
3817
                b->buf_x_offset = 0;
3818
                b->buf_y_offset = 0;
3819

    
3820
                if(orientation&1){
3821
                    b->buf += (w+1)>>1;
3822
                    b->buf_x_offset = (w+1)>>1;
3823
                }
3824
                if(orientation>1){
3825
                    b->buf += b->stride>>1;
3826
                    b->buf_y_offset = b->stride_line >> 1;
3827
                }
3828

    
3829
                if(level)
3830
                    b->parent= &s->plane[plane_index].band[level-1][orientation];
3831
                b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3832
            }
3833
            w= (w+1)>>1;
3834
            h= (h+1)>>1;
3835
        }
3836
    }
3837

    
3838
    for(i=0; i<MAX_REF_FRAMES; i++)
3839
        for(j=0; j<MAX_REF_FRAMES; j++)
3840
            scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3841

    
3842
    reset_contexts(s);
3843
/*
3844
    width= s->width= avctx->width;
3845
    height= s->height= avctx->height;
3846

3847
    assert(width && height);
3848
*/
3849
    s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3850

    
3851
    return 0;
3852
}
3853

    
3854
static int qscale2qlog(int qscale){
3855
    return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3856
           + 61*QROOT/8; //<64 >60
3857
}
3858

    
3859
static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3860
{
3861
    /* estimate the frame's complexity as a sum of weighted dwt coefs.
3862
     * FIXME we know exact mv bits at this point,
3863
     * but ratecontrol isn't set up to include them. */
3864
    uint32_t coef_sum= 0;
3865
    int level, orientation, delta_qlog;
3866

    
3867
    for(level=0; level<s->spatial_decomposition_count; level++){
3868
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3869
            SubBand *b= &s->plane[0].band[level][orientation];
3870
            DWTELEM *buf= b->buf;
3871
            const int w= b->width;
3872
            const int h= b->height;
3873
            const int stride= b->stride;
3874
            const int qlog= clip(2*QROOT + b->qlog, 0, QROOT*16);
3875
            const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3876
            const int qdiv= (1<<16)/qmul;
3877
            int x, y;
3878
            if(orientation==0)
3879
                decorrelate(s, b, buf, stride, 1, 0);
3880
            for(y=0; y<h; y++)
3881
                for(x=0; x<w; x++)
3882
                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3883
            if(orientation==0)
3884
                correlate(s, b, buf, stride, 1, 0);
3885
        }
3886
    }
3887

    
3888
    /* ugly, ratecontrol just takes a sqrt again */
3889
    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3890
    assert(coef_sum < INT_MAX);
3891

    
3892
    if(pict->pict_type == I_TYPE){
3893
        s->m.current_picture.mb_var_sum= coef_sum;
3894
        s->m.current_picture.mc_mb_var_sum= 0;
3895
    }else{
3896
        s->m.current_picture.mc_mb_var_sum= coef_sum;
3897
        s->m.current_picture.mb_var_sum= 0;
3898
    }
3899

    
3900
    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3901
    if (pict->quality < 0)
3902
        return INT_MIN;
3903
    s->lambda= pict->quality * 3/2;
3904
    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3905
    s->qlog+= delta_qlog;
3906
    return delta_qlog;
3907
}
3908

    
3909
static void calculate_vissual_weight(SnowContext *s, Plane *p){
3910
    int width = p->width;
3911
    int height= p->height;
3912
    int level, orientation, x, y;
3913

    
3914
    for(level=0; level<s->spatial_decomposition_count; level++){
3915
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3916
            SubBand *b= &p->band[level][orientation];
3917
            DWTELEM *buf= b->buf;
3918
            int64_t error=0;
3919

    
3920
            memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3921
            buf[b->width/2 + b->height/2*b->stride]= 256*256;
3922
            ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3923
            for(y=0; y<height; y++){
3924
                for(x=0; x<width; x++){
3925
                    int64_t d= s->spatial_dwt_buffer[x + y*width];
3926
                    error += d*d;
3927
                }
3928
            }
3929

    
3930
            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3931
//            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3932
        }
3933
    }
3934
}
3935

    
3936
static int encode_init(AVCodecContext *avctx)
3937
{
3938
    SnowContext *s = avctx->priv_data;
3939
    int plane_index;
3940

    
3941
    if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3942
        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3943
               "use vstrict=-2 / -strict -2 to use it anyway\n");
3944
        return -1;
3945
    }
3946

    
3947
    if(avctx->prediction_method == DWT_97
3948
       && (avctx->flags & CODEC_FLAG_QSCALE)
3949
       && avctx->global_quality == 0){
3950
        av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
3951
        return -1;
3952
    }
3953

    
3954
    common_init(avctx);
3955
    alloc_blocks(s);
3956

    
3957
    s->version=0;
3958

    
3959
    s->m.avctx   = avctx;
3960
    s->m.flags   = avctx->flags;
3961
    s->m.bit_rate= avctx->bit_rate;
3962

    
3963
    s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3964
    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3965
    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3966
    s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3967
    h263_encode_init(&s->m); //mv_penalty
3968

    
3969
    s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
3970

    
3971
    if(avctx->flags&CODEC_FLAG_PASS1){
3972
        if(!avctx->stats_out)
3973
            avctx->stats_out = av_mallocz(256);
3974
    }
3975
    if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
3976
        if(ff_rate_control_init(&s->m) < 0)
3977
            return -1;
3978
    }
3979
    s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
3980

    
3981
    for(plane_index=0; plane_index<3; plane_index++){
3982
        calculate_vissual_weight(s, &s->plane[plane_index]);
3983
    }
3984

    
3985

    
3986
    avctx->coded_frame= &s->current_picture;
3987
    switch(avctx->pix_fmt){
3988
//    case PIX_FMT_YUV444P:
3989
//    case PIX_FMT_YUV422P:
3990
    case PIX_FMT_YUV420P:
3991
    case PIX_FMT_GRAY8:
3992
//    case PIX_FMT_YUV411P:
3993
//    case PIX_FMT_YUV410P:
3994
        s->colorspace_type= 0;
3995
        break;
3996
/*    case PIX_FMT_RGBA32:
3997
        s->colorspace= 1;
3998
        break;*/
3999
    default:
4000
        av_log(avctx, AV_LOG_ERROR, "format not supported\n");
4001
        return -1;
4002
    }
4003
//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
4004
    s->chroma_h_shift= 1;
4005
    s->chroma_v_shift= 1;
4006

    
4007
    ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4008
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4009

    
4010
    s->avctx->get_buffer(s->avctx, &s->input_picture);
4011

    
4012
    if(s->avctx->me_method == ME_ITER){
4013
        int i;
4014
        int size= s->b_width * s->b_height << 2*s->block_max_depth;
4015
        for(i=0; i<s->max_ref_frames; i++){
4016
            s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4017
            s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4018
        }
4019
    }
4020

    
4021
    return 0;
4022
}
4023

    
4024
static int frame_start(SnowContext *s){
4025
   AVFrame tmp;
4026
   int w= s->avctx->width; //FIXME round up to x16 ?
4027
   int h= s->avctx->height;
4028

    
4029
    if(s->current_picture.data[0]){
4030
        draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4031
        draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4032
        draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4033
    }
4034

    
4035
    tmp= s->last_picture[s->max_ref_frames-1];
4036
    memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4037
    s->last_picture[0]= s->current_picture;
4038
    s->current_picture= tmp;
4039

    
4040
    if(s->keyframe){
4041
        s->ref_frames= 0;
4042
    }else{
4043
        int i;
4044
        for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4045
            if(i && s->last_picture[i-1].key_frame)
4046
                break;
4047
        s->ref_frames= i;
4048