Statistics
| Branch: | Revision:

ffmpeg / libavcodec / snow.c @ 849f1035

History | View | Annotate | Download (167 KB)

1
/*
2
 * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "avcodec.h"
22
#include "common.h"
23
#include "dsputil.h"
24
#include "snow.h"
25

    
26
#include "rangecoder.h"
27

    
28
#include "mpegvideo.h"
29

    
30
#undef NDEBUG
31
#include <assert.h>
32

    
33
static const int8_t quant3[256]={
34
 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
35
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
36
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
43
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
44
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
45
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
46
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
47
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
48
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
49
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
50
};
51
static const int8_t quant3b[256]={
52
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
57
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
59
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
60
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
61
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
62
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
63
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
64
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
65
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
66
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
67
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
68
};
69
static const int8_t quant3bA[256]={
70
 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
71
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
72
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
73
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
74
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
75
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
76
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
77
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
78
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
79
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
80
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
81
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
82
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
83
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
84
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
85
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
86
};
87
static const int8_t quant5[256]={
88
 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
91
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
92
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
93
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
94
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
95
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
96
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
97
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
98
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
99
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
100
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
101
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
102
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
103
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
104
};
105
static const int8_t quant7[256]={
106
 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
107
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
108
 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
109
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
110
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
111
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
112
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
113
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
114
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
115
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
116
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
117
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
118
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
119
-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
120
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
121
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
122
};
123
static const int8_t quant9[256]={
124
 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
125
 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
126
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
127
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
128
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
129
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
130
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
131
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
132
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
133
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
134
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
135
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
136
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
137
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
138
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
139
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
140
};
141
static const int8_t quant11[256]={
142
 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
143
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
144
 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
145
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
146
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
147
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
148
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
149
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
150
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
151
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
152
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
153
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
154
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
155
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
156
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
157
-4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
158
};
159
static const int8_t quant13[256]={
160
 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
161
 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
162
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
163
 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
164
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
165
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
166
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
167
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
168
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
169
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
170
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
171
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
172
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
173
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
174
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
175
-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
176
};
177

    
178
#if 0 //64*cubic
179
static const uint8_t obmc32[1024]={
180
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
182
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
183
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
184
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
185
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
186
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
187
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
188
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
189
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
190
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
191
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
192
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
193
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
194
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
195
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
196
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
197
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
198
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
199
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
200
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
201
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
202
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
203
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
204
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
205
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
206
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
207
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
208
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
209
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
210
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
211
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212
//error:0.000022
213
};
214
static const uint8_t obmc16[256]={
215
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
216
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
217
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
218
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
219
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
220
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
221
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
222
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
223
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
224
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
225
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
226
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
227
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
228
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
229
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
230
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
231
//error:0.000033
232
};
233
#elif 1 // 64*linear
234
static const uint8_t obmc32[1024]={
235
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
236
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
237
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
238
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
239
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
240
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
241
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
242
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
243
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
244
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
245
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
246
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
247
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
248
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
249
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
250
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
251
  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
252
  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
253
  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
254
  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
255
  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
256
  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
257
  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
258
  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
259
  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
260
  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
261
  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
262
  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
263
  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
264
  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
265
  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
266
  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
267
 //error:0.000020
268
};
269
static const uint8_t obmc16[256]={
270
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
271
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
272
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
273
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
274
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
275
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
276
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
277
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
278
 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
279
 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
280
 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
281
  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
282
  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
283
  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
284
  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
285
  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
286
//error:0.000015
287
};
288
#else //64*cos
289
static const uint8_t obmc32[1024]={
290
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
291
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
292
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
293
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
294
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
295
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
296
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
297
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
298
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
299
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
300
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
301
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
302
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
303
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
304
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
305
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
306
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
307
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
308
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
309
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
310
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
311
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
312
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
313
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
314
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
315
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
316
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
317
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
318
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
319
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
320
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
321
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
322
//error:0.000022
323
};
324
static const uint8_t obmc16[256]={
325
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
326
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
327
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
328
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
329
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
330
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
331
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
332
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
333
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
334
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
335
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
336
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
337
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
338
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
339
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
340
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
341
//error:0.000022
342
};
343
#endif
344

    
345
//linear *64
346
static const uint8_t obmc8[64]={
347
  4, 12, 20, 28, 28, 20, 12,  4,
348
 12, 36, 60, 84, 84, 60, 36, 12,
349
 20, 60,100,140,140,100, 60, 20,
350
 28, 84,140,196,196,140, 84, 28,
351
 28, 84,140,196,196,140, 84, 28,
352
 20, 60,100,140,140,100, 60, 20,
353
 12, 36, 60, 84, 84, 60, 36, 12,
354
  4, 12, 20, 28, 28, 20, 12,  4,
355
//error:0.000000
356
};
357

    
358
//linear *64
359
static const uint8_t obmc4[16]={
360
 16, 48, 48, 16,
361
 48,144,144, 48,
362
 48,144,144, 48,
363
 16, 48, 48, 16,
364
//error:0.000000
365
};
366

    
367
static const uint8_t *obmc_tab[4]={
368
    obmc32, obmc16, obmc8, obmc4
369
};
370

    
371
static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
372

    
373
typedef struct BlockNode{
374
    int16_t mx;
375
    int16_t my;
376
    uint8_t ref;
377
    uint8_t color[3];
378
    uint8_t type;
379
//#define TYPE_SPLIT    1
380
#define BLOCK_INTRA   1
381
#define BLOCK_OPT     2
382
//#define TYPE_NOCOLOR  4
383
    uint8_t level; //FIXME merge into type?
384
}BlockNode;
385

    
386
static const BlockNode null_block= { //FIXME add border maybe
387
    .color= {128,128,128},
388
    .mx= 0,
389
    .my= 0,
390
    .ref= 0,
391
    .type= 0,
392
    .level= 0,
393
};
394

    
395
#define LOG2_MB_SIZE 4
396
#define MB_SIZE (1<<LOG2_MB_SIZE)
397

    
398
typedef struct x_and_coeff{
399
    int16_t x;
400
    uint16_t coeff;
401
} x_and_coeff;
402

    
403
typedef struct SubBand{
404
    int level;
405
    int stride;
406
    int width;
407
    int height;
408
    int qlog;                                   ///< log(qscale)/log[2^(1/6)]
409
    DWTELEM *buf;
410
    int buf_x_offset;
411
    int buf_y_offset;
412
    int stride_line; ///< Stride measured in lines, not pixels.
413
    x_and_coeff * x_coeff;
414
    struct SubBand *parent;
415
    uint8_t state[/*7*2*/ 7 + 512][32];
416
}SubBand;
417

    
418
typedef struct Plane{
419
    int width;
420
    int height;
421
    SubBand band[MAX_DECOMPOSITIONS][4];
422
}Plane;
423

    
424
typedef struct SnowContext{
425
//    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
426

    
427
    AVCodecContext *avctx;
428
    RangeCoder c;
429
    DSPContext dsp;
430
    AVFrame new_picture;
431
    AVFrame input_picture;              ///< new_picture with the internal linesizes
432
    AVFrame current_picture;
433
    AVFrame last_picture[MAX_REF_FRAMES];
434
    AVFrame mconly_picture;
435
//     uint8_t q_context[16];
436
    uint8_t header_state[32];
437
    uint8_t block_state[128 + 32*128];
438
    int keyframe;
439
    int always_reset;
440
    int version;
441
    int spatial_decomposition_type;
442
    int temporal_decomposition_type;
443
    int spatial_decomposition_count;
444
    int temporal_decomposition_count;
445
    int max_ref_frames;
446
    int ref_frames;
447
    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
448
    uint32_t *ref_scores[MAX_REF_FRAMES];
449
    DWTELEM *spatial_dwt_buffer;
450
    int colorspace_type;
451
    int chroma_h_shift;
452
    int chroma_v_shift;
453
    int spatial_scalability;
454
    int qlog;
455
    int lambda;
456
    int lambda2;
457
    int pass1_rc;
458
    int mv_scale;
459
    int qbias;
460
#define QBIAS_SHIFT 3
461
    int b_width;
462
    int b_height;
463
    int block_max_depth;
464
    Plane plane[MAX_PLANES];
465
    BlockNode *block;
466
#define ME_CACHE_SIZE 1024
467
    int me_cache[ME_CACHE_SIZE];
468
    int me_cache_generation;
469
    slice_buffer sb;
470

    
471
    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
472
}SnowContext;
473

    
474
typedef struct {
475
    DWTELEM *b0;
476
    DWTELEM *b1;
477
    DWTELEM *b2;
478
    DWTELEM *b3;
479
    int y;
480
} dwt_compose_t;
481

    
482
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
483
//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
484

    
485
static void iterative_me(SnowContext *s);
486

    
487
static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
488
{
489
    int i;
490

    
491
    buf->base_buffer = base_buffer;
492
    buf->line_count = line_count;
493
    buf->line_width = line_width;
494
    buf->data_count = max_allocated_lines;
495
    buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
496
    buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
497

    
498
    for (i = 0; i < max_allocated_lines; i++)
499
    {
500
      buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
501
    }
502

    
503
    buf->data_stack_top = max_allocated_lines - 1;
504
}
505

    
506
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
507
{
508
    int offset;
509
    DWTELEM * buffer;
510

    
511
//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
512

    
513
    assert(buf->data_stack_top >= 0);
514
//  assert(!buf->line[line]);
515
    if (buf->line[line])
516
        return buf->line[line];
517

    
518
    offset = buf->line_width * line;
519
    buffer = buf->data_stack[buf->data_stack_top];
520
    buf->data_stack_top--;
521
    buf->line[line] = buffer;
522

    
523
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
524

    
525
    return buffer;
526
}
527

    
528
static void slice_buffer_release(slice_buffer * buf, int line)
529
{
530
    int offset;
531
    DWTELEM * buffer;
532

    
533
    assert(line >= 0 && line < buf->line_count);
534
    assert(buf->line[line]);
535

    
536
    offset = buf->line_width * line;
537
    buffer = buf->line[line];
538
    buf->data_stack_top++;
539
    buf->data_stack[buf->data_stack_top] = buffer;
540
    buf->line[line] = NULL;
541

    
542
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
543
}
544

    
545
static void slice_buffer_flush(slice_buffer * buf)
546
{
547
    int i;
548
    for (i = 0; i < buf->line_count; i++)
549
    {
550
        if (buf->line[i])
551
        {
552
//      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
553
            slice_buffer_release(buf, i);
554
        }
555
    }
556
}
557

    
558
static void slice_buffer_destroy(slice_buffer * buf)
559
{
560
    int i;
561
    slice_buffer_flush(buf);
562

    
563
    for (i = buf->data_count - 1; i >= 0; i--)
564
    {
565
        assert(buf->data_stack[i]);
566
        av_freep(&buf->data_stack[i]);
567
    }
568
    assert(buf->data_stack);
569
    av_freep(&buf->data_stack);
570
    assert(buf->line);
571
    av_freep(&buf->line);
572
}
573

    
574
#ifdef __sgi
575
// Avoid a name clash on SGI IRIX
576
#undef qexp
577
#endif
578
#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
579
static uint8_t qexp[QROOT];
580

    
581
static inline int mirror(int v, int m){
582
    while((unsigned)v > (unsigned)m){
583
        v=-v;
584
        if(v<0) v+= 2*m;
585
    }
586
    return v;
587
}
588

    
589
static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
590
    int i;
591

    
592
    if(v){
593
        const int a= FFABS(v);
594
        const int e= av_log2(a);
595
#if 1
596
        const int el= FFMIN(e, 10);
597
        put_rac(c, state+0, 0);
598

    
599
        for(i=0; i<el; i++){
600
            put_rac(c, state+1+i, 1);  //1..10
601
        }
602
        for(; i<e; i++){
603
            put_rac(c, state+1+9, 1);  //1..10
604
        }
605
        put_rac(c, state+1+FFMIN(i,9), 0);
606

    
607
        for(i=e-1; i>=el; i--){
608
            put_rac(c, state+22+9, (a>>i)&1); //22..31
609
        }
610
        for(; i>=0; i--){
611
            put_rac(c, state+22+i, (a>>i)&1); //22..31
612
        }
613

    
614
        if(is_signed)
615
            put_rac(c, state+11 + el, v < 0); //11..21
616
#else
617

    
618
        put_rac(c, state+0, 0);
619
        if(e<=9){
620
            for(i=0; i<e; i++){
621
                put_rac(c, state+1+i, 1);  //1..10
622
            }
623
            put_rac(c, state+1+i, 0);
624

    
625
            for(i=e-1; i>=0; i--){
626
                put_rac(c, state+22+i, (a>>i)&1); //22..31
627
            }
628

    
629
            if(is_signed)
630
                put_rac(c, state+11 + e, v < 0); //11..21
631
        }else{
632
            for(i=0; i<e; i++){
633
                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
634
            }
635
            put_rac(c, state+1+FFMIN(i,9), 0);
636

    
637
            for(i=e-1; i>=0; i--){
638
                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
639
            }
640

    
641
            if(is_signed)
642
                put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
643
        }
644
#endif
645
    }else{
646
        put_rac(c, state+0, 1);
647
    }
648
}
649

    
650
static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
651
    if(get_rac(c, state+0))
652
        return 0;
653
    else{
654
        int i, e, a;
655
        e= 0;
656
        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
657
            e++;
658
        }
659

    
660
        a= 1;
661
        for(i=e-1; i>=0; i--){
662
            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
663
        }
664

    
665
        if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
666
            return -a;
667
        else
668
            return a;
669
    }
670
}
671

    
672
static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
673
    int i;
674
    int r= log2>=0 ? 1<<log2 : 1;
675

    
676
    assert(v>=0);
677
    assert(log2>=-4);
678

    
679
    while(v >= r){
680
        put_rac(c, state+4+log2, 1);
681
        v -= r;
682
        log2++;
683
        if(log2>0) r+=r;
684
    }
685
    put_rac(c, state+4+log2, 0);
686

    
687
    for(i=log2-1; i>=0; i--){
688
        put_rac(c, state+31-i, (v>>i)&1);
689
    }
690
}
691

    
692
static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
693
    int i;
694
    int r= log2>=0 ? 1<<log2 : 1;
695
    int v=0;
696

    
697
    assert(log2>=-4);
698

    
699
    while(get_rac(c, state+4+log2)){
700
        v+= r;
701
        log2++;
702
        if(log2>0) r+=r;
703
    }
704

    
705
    for(i=log2-1; i>=0; i--){
706
        v+= get_rac(c, state+31-i)<<i;
707
    }
708

    
709
    return v;
710
}
711

    
712
static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
713
    const int mirror_left= !highpass;
714
    const int mirror_right= (width&1) ^ highpass;
715
    const int w= (width>>1) - 1 + (highpass & width);
716
    int i;
717

    
718
#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
719
    if(mirror_left){
720
        dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
721
        dst += dst_step;
722
        src += src_step;
723
    }
724

    
725
    for(i=0; i<w; i++){
726
        dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
727
    }
728

    
729
    if(mirror_right){
730
        dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
731
    }
732
}
733

    
734
#ifndef lift5
735
static av_always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
736
    const int mirror_left= !highpass;
737
    const int mirror_right= (width&1) ^ highpass;
738
    const int w= (width>>1) - 1 + (highpass & width);
739
    int i;
740

    
741
    if(mirror_left){
742
        int r= 3*2*ref[0];
743
        r += r>>4;
744
        r += r>>8;
745
        dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
746
        dst += dst_step;
747
        src += src_step;
748
    }
749

    
750
    for(i=0; i<w; i++){
751
        int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
752
        r += r>>4;
753
        r += r>>8;
754
        dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
755
    }
756

    
757
    if(mirror_right){
758
        int r= 3*2*ref[w*ref_step];
759
        r += r>>4;
760
        r += r>>8;
761
        dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
762
    }
763
}
764
#endif
765

    
766
#ifndef liftS
767
static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
768
    const int mirror_left= !highpass;
769
    const int mirror_right= (width&1) ^ highpass;
770
    const int w= (width>>1) - 1 + (highpass & width);
771
    int i;
772

    
773
    assert(shift == 4);
774
#define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
775
    if(mirror_left){
776
        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
777
        dst += dst_step;
778
        src += src_step;
779
    }
780

    
781
    for(i=0; i<w; i++){
782
        dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
783
    }
784

    
785
    if(mirror_right){
786
        dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
787
    }
788
}
789
#endif
790

    
791

    
792
static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
793
    int x, i;
794

    
795
    for(x=start; x<width; x+=2){
796
        int64_t sum=0;
797

    
798
        for(i=0; i<n; i++){
799
            int x2= x + 2*i - n + 1;
800
            if     (x2<     0) x2= -x2;
801
            else if(x2>=width) x2= 2*width-x2-2;
802
            sum += coeffs[i]*(int64_t)dst[x2];
803
        }
804
        if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
805
        else        dst[x] += (sum + (1<<shift)/2)>>shift;
806
    }
807
}
808

    
809
static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
810
    int x, y, i;
811
    for(y=start; y<height; y+=2){
812
        for(x=0; x<width; x++){
813
            int64_t sum=0;
814

    
815
            for(i=0; i<n; i++){
816
                int y2= y + 2*i - n + 1;
817
                if     (y2<      0) y2= -y2;
818
                else if(y2>=height) y2= 2*height-y2-2;
819
                sum += coeffs[i]*(int64_t)dst[x + y2*stride];
820
            }
821
            if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
822
            else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
823
        }
824
    }
825
}
826

    
827
#define SCALEX 1
828
#define LX0 0
829
#define LX1 1
830

    
831
#if 0 // more accurate 9/7
832
#define N1 2
833
#define SHIFT1 14
834
#define COEFFS1 (int[]){-25987,-25987}
835
#define N2 2
836
#define SHIFT2 19
837
#define COEFFS2 (int[]){-27777,-27777}
838
#define N3 2
839
#define SHIFT3 15
840
#define COEFFS3 (int[]){28931,28931}
841
#define N4 2
842
#define SHIFT4 15
843
#define COEFFS4 (int[]){14533,14533}
844
#elif 1 // 13/7 CRF
845
#define N1 4
846
#define SHIFT1 4
847
#define COEFFS1 (int[]){1,-9,-9,1}
848
#define N2 4
849
#define SHIFT2 4
850
#define COEFFS2 (int[]){-1,5,5,-1}
851
#define N3 0
852
#define SHIFT3 1
853
#define COEFFS3 NULL
854
#define N4 0
855
#define SHIFT4 1
856
#define COEFFS4 NULL
857
#elif 1 // 3/5
858
#define LX0 1
859
#define LX1 0
860
#define SCALEX 0.5
861
#define N1 2
862
#define SHIFT1 1
863
#define COEFFS1 (int[]){1,1}
864
#define N2 2
865
#define SHIFT2 2
866
#define COEFFS2 (int[]){-1,-1}
867
#define N3 0
868
#define SHIFT3 0
869
#define COEFFS3 NULL
870
#define N4 0
871
#define SHIFT4 0
872
#define COEFFS4 NULL
873
#elif 1 // 11/5
874
#define N1 0
875
#define SHIFT1 1
876
#define COEFFS1 NULL
877
#define N2 2
878
#define SHIFT2 2
879
#define COEFFS2 (int[]){-1,-1}
880
#define N3 2
881
#define SHIFT3 0
882
#define COEFFS3 (int[]){-1,-1}
883
#define N4 4
884
#define SHIFT4 7
885
#define COEFFS4 (int[]){-5,29,29,-5}
886
#define SCALEX 4
887
#elif 1 // 9/7 CDF
888
#define N1 2
889
#define SHIFT1 7
890
#define COEFFS1 (int[]){-203,-203}
891
#define N2 2
892
#define SHIFT2 12
893
#define COEFFS2 (int[]){-217,-217}
894
#define N3 2
895
#define SHIFT3 7
896
#define COEFFS3 (int[]){113,113}
897
#define N4 2
898
#define SHIFT4 9
899
#define COEFFS4 (int[]){227,227}
900
#define SCALEX 1
901
#elif 1 // 7/5 CDF
902
#define N1 0
903
#define SHIFT1 1
904
#define COEFFS1 NULL
905
#define N2 2
906
#define SHIFT2 2
907
#define COEFFS2 (int[]){-1,-1}
908
#define N3 2
909
#define SHIFT3 0
910
#define COEFFS3 (int[]){-1,-1}
911
#define N4 2
912
#define SHIFT4 4
913
#define COEFFS4 (int[]){3,3}
914
#elif 1 // 9/7 MN
915
#define N1 4
916
#define SHIFT1 4
917
#define COEFFS1 (int[]){1,-9,-9,1}
918
#define N2 2
919
#define SHIFT2 2
920
#define COEFFS2 (int[]){1,1}
921
#define N3 0
922
#define SHIFT3 1
923
#define COEFFS3 NULL
924
#define N4 0
925
#define SHIFT4 1
926
#define COEFFS4 NULL
927
#else // 13/7 CRF
928
#define N1 4
929
#define SHIFT1 4
930
#define COEFFS1 (int[]){1,-9,-9,1}
931
#define N2 4
932
#define SHIFT2 4
933
#define COEFFS2 (int[]){-1,5,5,-1}
934
#define N3 0
935
#define SHIFT3 1
936
#define COEFFS3 NULL
937
#define N4 0
938
#define SHIFT4 1
939
#define COEFFS4 NULL
940
#endif
941
static void horizontal_decomposeX(DWTELEM *b, int width){
942
    DWTELEM temp[width];
943
    const int width2= width>>1;
944
    const int w2= (width+1)>>1;
945
    int x;
946

    
947
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
948
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
949
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
950
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
951

    
952
    for(x=0; x<width2; x++){
953
        temp[x   ]= b[2*x    ];
954
        temp[x+w2]= b[2*x + 1];
955
    }
956
    if(width&1)
957
        temp[x   ]= b[2*x    ];
958
    memcpy(b, temp, width*sizeof(int));
959
}
960

    
961
static void horizontal_composeX(DWTELEM *b, int width){
962
    DWTELEM temp[width];
963
    const int width2= width>>1;
964
    int x;
965
    const int w2= (width+1)>>1;
966

    
967
    memcpy(temp, b, width*sizeof(int));
968
    for(x=0; x<width2; x++){
969
        b[2*x    ]= temp[x   ];
970
        b[2*x + 1]= temp[x+w2];
971
    }
972
    if(width&1)
973
        b[2*x    ]= temp[x   ];
974

    
975
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
976
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
977
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
978
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
979
}
980

    
981
static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
982
    int x, y;
983

    
984
    for(y=0; y<height; y++){
985
        for(x=0; x<width; x++){
986
            buffer[y*stride + x] *= SCALEX;
987
        }
988
    }
989

    
990
    for(y=0; y<height; y++){
991
        horizontal_decomposeX(buffer + y*stride, width);
992
    }
993

    
994
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
995
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
996
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
997
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
998
}
999

    
1000
static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
1001
    int x, y;
1002

    
1003
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1004
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1005
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1006
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1007

    
1008
    for(y=0; y<height; y++){
1009
        horizontal_composeX(buffer + y*stride, width);
1010
    }
1011

    
1012
    for(y=0; y<height; y++){
1013
        for(x=0; x<width; x++){
1014
            buffer[y*stride + x] /= SCALEX;
1015
        }
1016
    }
1017
}
1018

    
1019
static void horizontal_decompose53i(DWTELEM *b, int width){
1020
    DWTELEM temp[width];
1021
    const int width2= width>>1;
1022
    int x;
1023
    const int w2= (width+1)>>1;
1024

    
1025
    for(x=0; x<width2; x++){
1026
        temp[x   ]= b[2*x    ];
1027
        temp[x+w2]= b[2*x + 1];
1028
    }
1029
    if(width&1)
1030
        temp[x   ]= b[2*x    ];
1031
#if 0
1032
    {
1033
    int A1,A2,A3,A4;
1034
    A2= temp[1       ];
1035
    A4= temp[0       ];
1036
    A1= temp[0+width2];
1037
    A1 -= (A2 + A4)>>1;
1038
    A4 += (A1 + 1)>>1;
1039
    b[0+width2] = A1;
1040
    b[0       ] = A4;
1041
    for(x=1; x+1<width2; x+=2){
1042
        A3= temp[x+width2];
1043
        A4= temp[x+1     ];
1044
        A3 -= (A2 + A4)>>1;
1045
        A2 += (A1 + A3 + 2)>>2;
1046
        b[x+width2] = A3;
1047
        b[x       ] = A2;
1048

1049
        A1= temp[x+1+width2];
1050
        A2= temp[x+2       ];
1051
        A1 -= (A2 + A4)>>1;
1052
        A4 += (A1 + A3 + 2)>>2;
1053
        b[x+1+width2] = A1;
1054
        b[x+1       ] = A4;
1055
    }
1056
    A3= temp[width-1];
1057
    A3 -= A2;
1058
    A2 += (A1 + A3 + 2)>>2;
1059
    b[width -1] = A3;
1060
    b[width2-1] = A2;
1061
    }
1062
#else
1063
    lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1064
    lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1065
#endif
1066
}
1067

    
1068
static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1069
    int i;
1070

    
1071
    for(i=0; i<width; i++){
1072
        b1[i] -= (b0[i] + b2[i])>>1;
1073
    }
1074
}
1075

    
1076
static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1077
    int i;
1078

    
1079
    for(i=0; i<width; i++){
1080
        b1[i] += (b0[i] + b2[i] + 2)>>2;
1081
    }
1082
}
1083

    
1084
static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1085
    int y;
1086
    DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1087
    DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1088

    
1089
    for(y=-2; y<height; y+=2){
1090
        DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1091
        DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1092

    
1093
{START_TIMER
1094
        if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1095
        if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1096
STOP_TIMER("horizontal_decompose53i")}
1097

    
1098
{START_TIMER
1099
        if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1100
        if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1101
STOP_TIMER("vertical_decompose53i*")}
1102

    
1103
        b0=b2;
1104
        b1=b3;
1105
    }
1106
}
1107

    
1108
static void horizontal_decompose97i(DWTELEM *b, int width){
1109
    DWTELEM temp[width];
1110
    const int w2= (width+1)>>1;
1111

    
1112
    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1113
    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1114
    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1115
    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1116
}
1117

    
1118

    
1119
static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1120
    int i;
1121

    
1122
    for(i=0; i<width; i++){
1123
        b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1124
    }
1125
}
1126

    
1127
static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1128
    int i;
1129

    
1130
    for(i=0; i<width; i++){
1131
#ifdef lift5
1132
        b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1133
#else
1134
        int r= 3*(b0[i] + b2[i]);
1135
        r+= r>>4;
1136
        r+= r>>8;
1137
        b1[i] += (r+W_CO)>>W_CS;
1138
#endif
1139
    }
1140
}
1141

    
1142
static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1143
    int i;
1144

    
1145
    for(i=0; i<width; i++){
1146
#ifdef liftS
1147
        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1148
#else
1149
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1150
#endif
1151
    }
1152
}
1153

    
1154
static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1155
    int i;
1156

    
1157
    for(i=0; i<width; i++){
1158
        b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1159
    }
1160
}
1161

    
1162
static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1163
    int y;
1164
    DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1165
    DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1166
    DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1167
    DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1168

    
1169
    for(y=-4; y<height; y+=2){
1170
        DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1171
        DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1172

    
1173
{START_TIMER
1174
        if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1175
        if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1176
if(width>400){
1177
STOP_TIMER("horizontal_decompose97i")
1178
}}
1179

    
1180
{START_TIMER
1181
        if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1182
        if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1183
        if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1184
        if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1185

    
1186
if(width>400){
1187
STOP_TIMER("vertical_decompose97i")
1188
}}
1189

    
1190
        b0=b2;
1191
        b1=b3;
1192
        b2=b4;
1193
        b3=b5;
1194
    }
1195
}
1196

    
1197
void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1198
    int level;
1199

    
1200
    for(level=0; level<decomposition_count; level++){
1201
        switch(type){
1202
        case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1203
        case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1204
        case DWT_X: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1205
        }
1206
    }
1207
}
1208

    
1209
static void horizontal_compose53i(DWTELEM *b, int width){
1210
    DWTELEM temp[width];
1211
    const int width2= width>>1;
1212
    const int w2= (width+1)>>1;
1213
    int x;
1214

    
1215
#if 0
1216
    int A1,A2,A3,A4;
1217
    A2= temp[1       ];
1218
    A4= temp[0       ];
1219
    A1= temp[0+width2];
1220
    A1 -= (A2 + A4)>>1;
1221
    A4 += (A1 + 1)>>1;
1222
    b[0+width2] = A1;
1223
    b[0       ] = A4;
1224
    for(x=1; x+1<width2; x+=2){
1225
        A3= temp[x+width2];
1226
        A4= temp[x+1     ];
1227
        A3 -= (A2 + A4)>>1;
1228
        A2 += (A1 + A3 + 2)>>2;
1229
        b[x+width2] = A3;
1230
        b[x       ] = A2;
1231

1232
        A1= temp[x+1+width2];
1233
        A2= temp[x+2       ];
1234
        A1 -= (A2 + A4)>>1;
1235
        A4 += (A1 + A3 + 2)>>2;
1236
        b[x+1+width2] = A1;
1237
        b[x+1       ] = A4;
1238
    }
1239
    A3= temp[width-1];
1240
    A3 -= A2;
1241
    A2 += (A1 + A3 + 2)>>2;
1242
    b[width -1] = A3;
1243
    b[width2-1] = A2;
1244
#else
1245
    lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1246
    lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1247
#endif
1248
    for(x=0; x<width2; x++){
1249
        b[2*x    ]= temp[x   ];
1250
        b[2*x + 1]= temp[x+w2];
1251
    }
1252
    if(width&1)
1253
        b[2*x    ]= temp[x   ];
1254
}
1255

    
1256
static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1257
    int i;
1258

    
1259
    for(i=0; i<width; i++){
1260
        b1[i] += (b0[i] + b2[i])>>1;
1261
    }
1262
}
1263

    
1264
static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1265
    int i;
1266

    
1267
    for(i=0; i<width; i++){
1268
        b1[i] -= (b0[i] + b2[i] + 2)>>2;
1269
    }
1270
}
1271

    
1272
static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1273
    cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1274
    cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1275
    cs->y = -1;
1276
}
1277

    
1278
static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1279
    cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1280
    cs->b1 = buffer + mirror(-1  , height-1)*stride;
1281
    cs->y = -1;
1282
}
1283

    
1284
static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1285
    int y= cs->y;
1286

    
1287
    DWTELEM *b0= cs->b0;
1288
    DWTELEM *b1= cs->b1;
1289
    DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1290
    DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1291

    
1292
{START_TIMER
1293
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1294
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1295
STOP_TIMER("vertical_compose53i*")}
1296

    
1297
{START_TIMER
1298
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1299
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1300
STOP_TIMER("horizontal_compose53i")}
1301

    
1302
    cs->b0 = b2;
1303
    cs->b1 = b3;
1304
    cs->y += 2;
1305
}
1306

    
1307
static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1308
    int y= cs->y;
1309
    DWTELEM *b0= cs->b0;
1310
    DWTELEM *b1= cs->b1;
1311
    DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1312
    DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1313

    
1314
{START_TIMER
1315
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1316
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1317
STOP_TIMER("vertical_compose53i*")}
1318

    
1319
{START_TIMER
1320
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1321
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1322
STOP_TIMER("horizontal_compose53i")}
1323

    
1324
    cs->b0 = b2;
1325
    cs->b1 = b3;
1326
    cs->y += 2;
1327
}
1328

    
1329
static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1330
    dwt_compose_t cs;
1331
    spatial_compose53i_init(&cs, buffer, height, stride);
1332
    while(cs.y <= height)
1333
        spatial_compose53i_dy(&cs, buffer, width, height, stride);
1334
}
1335

    
1336

    
1337
void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1338
    DWTELEM temp[width];
1339
    const int w2= (width+1)>>1;
1340

    
1341
    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1342
    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1343
    liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1344
    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1345
}
1346

    
1347
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1348
    int i;
1349

    
1350
    for(i=0; i<width; i++){
1351
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1352
    }
1353
}
1354

    
1355
static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1356
    int i;
1357

    
1358
    for(i=0; i<width; i++){
1359
#ifdef lift5
1360
        b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1361
#else
1362
        int r= 3*(b0[i] + b2[i]);
1363
        r+= r>>4;
1364
        r+= r>>8;
1365
        b1[i] -= (r+W_CO)>>W_CS;
1366
#endif
1367
    }
1368
}
1369

    
1370
static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1371
    int i;
1372

    
1373
    for(i=0; i<width; i++){
1374
#ifdef liftS
1375
        b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1376
#else
1377
        b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1378
#endif
1379
    }
1380
}
1381

    
1382
static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1383
    int i;
1384

    
1385
    for(i=0; i<width; i++){
1386
        b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1387
    }
1388
}
1389

    
1390
void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1391
    int i;
1392

    
1393
    for(i=0; i<width; i++){
1394
#ifndef lift5
1395
        int r;
1396
#endif
1397
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1398
#ifdef lift5
1399
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1400
#else
1401
        r= 3*(b2[i] + b4[i]);
1402
        r+= r>>4;
1403
        r+= r>>8;
1404
        b3[i] -= (r+W_CO)>>W_CS;
1405
#endif
1406
#ifdef liftS
1407
        b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1408
#else
1409
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1410
#endif
1411
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1412
    }
1413
}
1414

    
1415
static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1416
    cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1417
    cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1418
    cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1419
    cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1420
    cs->y = -3;
1421
}
1422

    
1423
static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1424
    cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1425
    cs->b1 = buffer + mirror(-3  , height-1)*stride;
1426
    cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1427
    cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1428
    cs->y = -3;
1429
}
1430

    
1431
static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1432
    int y = cs->y;
1433

    
1434
    DWTELEM *b0= cs->b0;
1435
    DWTELEM *b1= cs->b1;
1436
    DWTELEM *b2= cs->b2;
1437
    DWTELEM *b3= cs->b3;
1438
    DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1439
    DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1440

    
1441
{START_TIMER
1442
    if(y>0 && y+4<height){
1443
        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1444
    }else{
1445
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1446
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1447
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1448
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1449
    }
1450
if(width>400){
1451
STOP_TIMER("vertical_compose97i")}}
1452

    
1453
{START_TIMER
1454
        if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1455
        if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1456
if(width>400 && y+0<(unsigned)height){
1457
STOP_TIMER("horizontal_compose97i")}}
1458

    
1459
    cs->b0=b2;
1460
    cs->b1=b3;
1461
    cs->b2=b4;
1462
    cs->b3=b5;
1463
    cs->y += 2;
1464
}
1465

    
1466
static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1467
    int y = cs->y;
1468
    DWTELEM *b0= cs->b0;
1469
    DWTELEM *b1= cs->b1;
1470
    DWTELEM *b2= cs->b2;
1471
    DWTELEM *b3= cs->b3;
1472
    DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1473
    DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1474

    
1475
{START_TIMER
1476
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1477
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1478
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1479
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1480
if(width>400){
1481
STOP_TIMER("vertical_compose97i")}}
1482

    
1483
{START_TIMER
1484
        if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1485
        if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1486
if(width>400 && b0 <= b2){
1487
STOP_TIMER("horizontal_compose97i")}}
1488

    
1489
    cs->b0=b2;
1490
    cs->b1=b3;
1491
    cs->b2=b4;
1492
    cs->b3=b5;
1493
    cs->y += 2;
1494
}
1495

    
1496
static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1497
    dwt_compose_t cs;
1498
    spatial_compose97i_init(&cs, buffer, height, stride);
1499
    while(cs.y <= height)
1500
        spatial_compose97i_dy(&cs, buffer, width, height, stride);
1501
}
1502

    
1503
static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1504
    int level;
1505
    for(level=decomposition_count-1; level>=0; level--){
1506
        switch(type){
1507
        case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1508
        case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1509
        /* not slicified yet */
1510
        case DWT_X: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1511
          av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1512
        }
1513
    }
1514
}
1515

    
1516
static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1517
    int level;
1518
    for(level=decomposition_count-1; level>=0; level--){
1519
        switch(type){
1520
        case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1521
        case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1522
        /* not slicified yet */
1523
        case DWT_X: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1524
        }
1525
    }
1526
}
1527

    
1528
static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1529
    const int support = type==1 ? 3 : 5;
1530
    int level;
1531
    if(type==2) return;
1532

    
1533
    for(level=decomposition_count-1; level>=0; level--){
1534
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1535
            switch(type){
1536
            case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1537
                    break;
1538
            case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1539
                    break;
1540
            case DWT_X: break;
1541
            }
1542
        }
1543
    }
1544
}
1545

    
1546
static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1547
    const int support = type==1 ? 3 : 5;
1548
    int level;
1549
    if(type==2) return;
1550

    
1551
    for(level=decomposition_count-1; level>=0; level--){
1552
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1553
            switch(type){
1554
            case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1555
                    break;
1556
            case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1557
                    break;
1558
            case DWT_X: break;
1559
            }
1560
        }
1561
    }
1562
}
1563

    
1564
static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1565
    if(type==2){
1566
        int level;
1567
        for(level=decomposition_count-1; level>=0; level--)
1568
            spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1569
    }else{
1570
        dwt_compose_t cs[MAX_DECOMPOSITIONS];
1571
        int y;
1572
        ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1573
        for(y=0; y<height; y+=4)
1574
            ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1575
    }
1576
}
1577

    
1578
static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1579
    const int w= b->width;
1580
    const int h= b->height;
1581
    int x, y;
1582

    
1583
    if(1){
1584
        int run=0;
1585
        int runs[w*h];
1586
        int run_index=0;
1587
        int max_index;
1588

    
1589
        for(y=0; y<h; y++){
1590
            for(x=0; x<w; x++){
1591
                int v, p=0;
1592
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1593
                v= src[x + y*stride];
1594

    
1595
                if(y){
1596
                    t= src[x + (y-1)*stride];
1597
                    if(x){
1598
                        lt= src[x - 1 + (y-1)*stride];
1599
                    }
1600
                    if(x + 1 < w){
1601
                        rt= src[x + 1 + (y-1)*stride];
1602
                    }
1603
                }
1604
                if(x){
1605
                    l= src[x - 1 + y*stride];
1606
                    /*if(x > 1){
1607
                        if(orientation==1) ll= src[y + (x-2)*stride];
1608
                        else               ll= src[x - 2 + y*stride];
1609
                    }*/
1610
                }
1611
                if(parent){
1612
                    int px= x>>1;
1613
                    int py= y>>1;
1614
                    if(px<b->parent->width && py<b->parent->height)
1615
                        p= parent[px + py*2*stride];
1616
                }
1617
                if(!(/*ll|*/l|lt|t|rt|p)){
1618
                    if(v){
1619
                        runs[run_index++]= run;
1620
                        run=0;
1621
                    }else{
1622
                        run++;
1623
                    }
1624
                }
1625
            }
1626
        }
1627
        max_index= run_index;
1628
        runs[run_index++]= run;
1629
        run_index=0;
1630
        run= runs[run_index++];
1631

    
1632
        put_symbol2(&s->c, b->state[30], max_index, 0);
1633
        if(run_index <= max_index)
1634
            put_symbol2(&s->c, b->state[1], run, 3);
1635

    
1636
        for(y=0; y<h; y++){
1637
            if(s->c.bytestream_end - s->c.bytestream < w*40){
1638
                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1639
                return -1;
1640
            }
1641
            for(x=0; x<w; x++){
1642
                int v, p=0;
1643
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1644
                v= src[x + y*stride];
1645

    
1646
                if(y){
1647
                    t= src[x + (y-1)*stride];
1648
                    if(x){
1649
                        lt= src[x - 1 + (y-1)*stride];
1650
                    }
1651
                    if(x + 1 < w){
1652
                        rt= src[x + 1 + (y-1)*stride];
1653
                    }
1654
                }
1655
                if(x){
1656
                    l= src[x - 1 + y*stride];
1657
                    /*if(x > 1){
1658
                        if(orientation==1) ll= src[y + (x-2)*stride];
1659
                        else               ll= src[x - 2 + y*stride];
1660
                    }*/
1661
                }
1662
                if(parent){
1663
                    int px= x>>1;
1664
                    int py= y>>1;
1665
                    if(px<b->parent->width && py<b->parent->height)
1666
                        p= parent[px + py*2*stride];
1667
                }
1668
                if(/*ll|*/l|lt|t|rt|p){
1669
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1670

    
1671
                    put_rac(&s->c, &b->state[0][context], !!v);
1672
                }else{
1673
                    if(!run){
1674
                        run= runs[run_index++];
1675

    
1676
                        if(run_index <= max_index)
1677
                            put_symbol2(&s->c, b->state[1], run, 3);
1678
                        assert(v);
1679
                    }else{
1680
                        run--;
1681
                        assert(!v);
1682
                    }
1683
                }
1684
                if(v){
1685
                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1686
                    int l2= 2*FFABS(l) + (l<0);
1687
                    int t2= 2*FFABS(t) + (t<0);
1688

    
1689
                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1690
                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1691
                }
1692
            }
1693
        }
1694
    }
1695
    return 0;
1696
}
1697

    
1698
static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1699
//    encode_subband_qtree(s, b, src, parent, stride, orientation);
1700
//    encode_subband_z0run(s, b, src, parent, stride, orientation);
1701
    return encode_subband_c0run(s, b, src, parent, stride, orientation);
1702
//    encode_subband_dzr(s, b, src, parent, stride, orientation);
1703
}
1704

    
1705
static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1706
    const int w= b->width;
1707
    const int h= b->height;
1708
    int x,y;
1709

    
1710
    if(1){
1711
        int run, runs;
1712
        x_and_coeff *xc= b->x_coeff;
1713
        x_and_coeff *prev_xc= NULL;
1714
        x_and_coeff *prev2_xc= xc;
1715
        x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1716
        x_and_coeff *prev_parent_xc= parent_xc;
1717

    
1718
        runs= get_symbol2(&s->c, b->state[30], 0);
1719
        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1720
        else           run= INT_MAX;
1721

    
1722
        for(y=0; y<h; y++){
1723
            int v=0;
1724
            int lt=0, t=0, rt=0;
1725

    
1726
            if(y && prev_xc->x == 0){
1727
                rt= prev_xc->coeff;
1728
            }
1729
            for(x=0; x<w; x++){
1730
                int p=0;
1731
                const int l= v;
1732

    
1733
                lt= t; t= rt;
1734

    
1735
                if(y){
1736
                    if(prev_xc->x <= x)
1737
                        prev_xc++;
1738
                    if(prev_xc->x == x + 1)
1739
                        rt= prev_xc->coeff;
1740
                    else
1741
                        rt=0;
1742
                }
1743
                if(parent_xc){
1744
                    if(x>>1 > parent_xc->x){
1745
                        parent_xc++;
1746
                    }
1747
                    if(x>>1 == parent_xc->x){
1748
                        p= parent_xc->coeff;
1749
                    }
1750
                }
1751
                if(/*ll|*/l|lt|t|rt|p){
1752
                    int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1753

    
1754
                    v=get_rac(&s->c, &b->state[0][context]);
1755
                    if(v){
1756
                        v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1757
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1758

    
1759
                        xc->x=x;
1760
                        (xc++)->coeff= v;
1761
                    }
1762
                }else{
1763
                    if(!run){
1764
                        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1765
                        else           run= INT_MAX;
1766
                        v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1767
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1768

    
1769
                        xc->x=x;
1770
                        (xc++)->coeff= v;
1771
                    }else{
1772
                        int max_run;
1773
                        run--;
1774
                        v=0;
1775

    
1776
                        if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1777
                        else  max_run= FFMIN(run, w-x-1);
1778
                        if(parent_xc)
1779
                            max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1780
                        x+= max_run;
1781
                        run-= max_run;
1782
                    }
1783
                }
1784
            }
1785
            (xc++)->x= w+1; //end marker
1786
            prev_xc= prev2_xc;
1787
            prev2_xc= xc;
1788

    
1789
            if(parent_xc){
1790
                if(y&1){
1791
                    while(parent_xc->x != parent->width+1)
1792
                        parent_xc++;
1793
                    parent_xc++;
1794
                    prev_parent_xc= parent_xc;
1795
                }else{
1796
                    parent_xc= prev_parent_xc;
1797
                }
1798
            }
1799
        }
1800

    
1801
        (xc++)->x= w+1; //end marker
1802
    }
1803
}
1804

    
1805
static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1806
    const int w= b->width;
1807
    int y;
1808
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
1809
    int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1810
    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1811
    int new_index = 0;
1812

    
1813
    START_TIMER
1814

    
1815
    if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1816
        qadd= 0;
1817
        qmul= 1<<QEXPSHIFT;
1818
    }
1819

    
1820
    /* If we are on the second or later slice, restore our index. */
1821
    if (start_y != 0)
1822
        new_index = save_state[0];
1823

    
1824

    
1825
    for(y=start_y; y<h; y++){
1826
        int x = 0;
1827
        int v;
1828
        DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1829
        memset(line, 0, b->width*sizeof(DWTELEM));
1830
        v = b->x_coeff[new_index].coeff;
1831
        x = b->x_coeff[new_index++].x;
1832
        while(x < w)
1833
        {
1834
            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1835
            register int u= -(v&1);
1836
            line[x] = (t^u) - u;
1837

    
1838
            v = b->x_coeff[new_index].coeff;
1839
            x = b->x_coeff[new_index++].x;
1840
        }
1841
    }
1842
    if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1843
        STOP_TIMER("decode_subband")
1844
    }
1845

    
1846
    /* Save our variables for the next slice. */
1847
    save_state[0] = new_index;
1848

    
1849
    return;
1850
}
1851

    
1852
static void reset_contexts(SnowContext *s){
1853
    int plane_index, level, orientation;
1854

    
1855
    for(plane_index=0; plane_index<3; plane_index++){
1856
        for(level=0; level<s->spatial_decomposition_count; level++){
1857
            for(orientation=level ? 1:0; orientation<4; orientation++){
1858
                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1859
            }
1860
        }
1861
    }
1862
    memset(s->header_state, MID_STATE, sizeof(s->header_state));
1863
    memset(s->block_state, MID_STATE, sizeof(s->block_state));
1864
}
1865

    
1866
static int alloc_blocks(SnowContext *s){
1867
    int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1868
    int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1869

    
1870
    s->b_width = w;
1871
    s->b_height= h;
1872

    
1873
    s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1874
    return 0;
1875
}
1876

    
1877
static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1878
    uint8_t *bytestream= d->bytestream;
1879
    uint8_t *bytestream_start= d->bytestream_start;
1880
    *d= *s;
1881
    d->bytestream= bytestream;
1882
    d->bytestream_start= bytestream_start;
1883
}
1884

    
1885
//near copy & paste from dsputil, FIXME
1886
static int pix_sum(uint8_t * pix, int line_size, int w)
1887
{
1888
    int s, i, j;
1889

    
1890
    s = 0;
1891
    for (i = 0; i < w; i++) {
1892
        for (j = 0; j < w; j++) {
1893
            s += pix[0];
1894
            pix ++;
1895
        }
1896
        pix += line_size - w;
1897
    }
1898
    return s;
1899
}
1900

    
1901
//near copy & paste from dsputil, FIXME
1902
static int pix_norm1(uint8_t * pix, int line_size, int w)
1903
{
1904
    int s, i, j;
1905
    uint32_t *sq = ff_squareTbl + 256;
1906

    
1907
    s = 0;
1908
    for (i = 0; i < w; i++) {
1909
        for (j = 0; j < w; j ++) {
1910
            s += sq[pix[0]];
1911
            pix ++;
1912
        }
1913
        pix += line_size - w;
1914
    }
1915
    return s;
1916
}
1917

    
1918
static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1919
    const int w= s->b_width << s->block_max_depth;
1920
    const int rem_depth= s->block_max_depth - level;
1921
    const int index= (x + y*w) << rem_depth;
1922
    const int block_w= 1<<rem_depth;
1923
    BlockNode block;
1924
    int i,j;
1925

    
1926
    block.color[0]= l;
1927
    block.color[1]= cb;
1928
    block.color[2]= cr;
1929
    block.mx= mx;
1930
    block.my= my;
1931
    block.ref= ref;
1932
    block.type= type;
1933
    block.level= level;
1934

    
1935
    for(j=0; j<block_w; j++){
1936
        for(i=0; i<block_w; i++){
1937
            s->block[index + i + j*w]= block;
1938
        }
1939
    }
1940
}
1941

    
1942
static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1943
    const int offset[3]= {
1944
          y*c->  stride + x,
1945
        ((y*c->uvstride + x)>>1),
1946
        ((y*c->uvstride + x)>>1),
1947
    };
1948
    int i;
1949
    for(i=0; i<3; i++){
1950
        c->src[0][i]= src [i];
1951
        c->ref[0][i]= ref [i] + offset[i];
1952
    }
1953
    assert(!ref_index);
1954
}
1955

    
1956
static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1957
                           BlockNode *left, BlockNode *top, BlockNode *tr){
1958
    if(s->ref_frames == 1){
1959
        *mx = mid_pred(left->mx, top->mx, tr->mx);
1960
        *my = mid_pred(left->my, top->my, tr->my);
1961
    }else{
1962
        const int *scale = scale_mv_ref[ref];
1963
        *mx = mid_pred(left->mx * scale[left->ref] + 128 >>8,
1964
                       top ->mx * scale[top ->ref] + 128 >>8,
1965
                       tr  ->mx * scale[tr  ->ref] + 128 >>8);
1966
        *my = mid_pred(left->my * scale[left->ref] + 128 >>8,
1967
                       top ->my * scale[top ->ref] + 128 >>8,
1968
                       tr  ->my * scale[tr  ->ref] + 128 >>8);
1969
    }
1970
}
1971

    
1972
//FIXME copy&paste
1973
#define P_LEFT P[1]
1974
#define P_TOP P[2]
1975
#define P_TOPRIGHT P[3]
1976
#define P_MEDIAN P[4]
1977
#define P_MV1 P[9]
1978
#define FLAG_QPEL   1 //must be 1
1979

    
1980
static int encode_q_branch(SnowContext *s, int level, int x, int y){
1981
    uint8_t p_buffer[1024];
1982
    uint8_t i_buffer[1024];
1983
    uint8_t p_state[sizeof(s->block_state)];
1984
    uint8_t i_state[sizeof(s->block_state)];
1985
    RangeCoder pc, ic;
1986
    uint8_t *pbbak= s->c.bytestream;
1987
    uint8_t *pbbak_start= s->c.bytestream_start;
1988
    int score, score2, iscore, i_len, p_len, block_s, sum;
1989
    const int w= s->b_width  << s->block_max_depth;
1990
    const int h= s->b_height << s->block_max_depth;
1991
    const int rem_depth= s->block_max_depth - level;
1992
    const int index= (x + y*w) << rem_depth;
1993
    const int block_w= 1<<(LOG2_MB_SIZE - level);
1994
    int trx= (x+1)<<rem_depth;
1995
    int try= (y+1)<<rem_depth;
1996
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
1997
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
1998
    BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
1999
    BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2000
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2001
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2002
    int pl = left->color[0];
2003
    int pcb= left->color[1];
2004
    int pcr= left->color[2];
2005
    int pmx, pmy;
2006
    int mx=0, my=0;
2007
    int l,cr,cb;
2008
    const int stride= s->current_picture.linesize[0];
2009
    const int uvstride= s->current_picture.linesize[1];
2010
    uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2011
                                s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2012
                                s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2013
    int P[10][2];
2014
    int16_t last_mv[3][2];
2015
    int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2016
    const int shift= 1+qpel;
2017
    MotionEstContext *c= &s->m.me;
2018
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2019
    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2020
    int my_context= av_log2(2*FFABS(left->my - top->my));
2021
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2022
    int ref, best_ref, ref_score, ref_mx, ref_my;
2023

    
2024
    assert(sizeof(s->block_state) >= 256);
2025
    if(s->keyframe){
2026
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2027
        return 0;
2028
    }
2029

    
2030
//    clip predictors / edge ?
2031

    
2032
    P_LEFT[0]= left->mx;
2033
    P_LEFT[1]= left->my;
2034
    P_TOP [0]= top->mx;
2035
    P_TOP [1]= top->my;
2036
    P_TOPRIGHT[0]= tr->mx;
2037
    P_TOPRIGHT[1]= tr->my;
2038

    
2039
    last_mv[0][0]= s->block[index].mx;
2040
    last_mv[0][1]= s->block[index].my;
2041
    last_mv[1][0]= right->mx;
2042
    last_mv[1][1]= right->my;
2043
    last_mv[2][0]= bottom->mx;
2044
    last_mv[2][1]= bottom->my;
2045

    
2046
    s->m.mb_stride=2;
2047
    s->m.mb_x=
2048
    s->m.mb_y= 0;
2049
    s->m.me.skip= 0;
2050

    
2051
    assert(s->m.me.  stride ==   stride);
2052
    assert(s->m.me.uvstride == uvstride);
2053

    
2054
    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2055
    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2056
    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2057
    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2058

    
2059
    c->xmin = - x*block_w - 16+2;
2060
    c->ymin = - y*block_w - 16+2;
2061
    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2062
    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2063

    
2064
    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2065
    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2066
    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2067
    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2068
    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2069
    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2070
    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2071

    
2072
    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2073
    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2074

    
2075
    if (!y) {
2076
        c->pred_x= P_LEFT[0];
2077
        c->pred_y= P_LEFT[1];
2078
    } else {
2079
        c->pred_x = P_MEDIAN[0];
2080
        c->pred_y = P_MEDIAN[1];
2081
    }
2082

    
2083
    score= INT_MAX;
2084
    best_ref= 0;
2085
    for(ref=0; ref<s->ref_frames; ref++){
2086
        init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2087

    
2088
        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2089
                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2090

    
2091
        assert(ref_mx >= c->xmin);
2092
        assert(ref_mx <= c->xmax);
2093
        assert(ref_my >= c->ymin);
2094
        assert(ref_my <= c->ymax);
2095

    
2096
        ref_score= s->m.me.sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2097
        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2098
        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2099
        if(s->ref_mvs[ref]){
2100
            s->ref_mvs[ref][index][0]= ref_mx;
2101
            s->ref_mvs[ref][index][1]= ref_my;
2102
            s->ref_scores[ref][index]= ref_score;
2103
        }
2104
        if(score > ref_score){
2105
            score= ref_score;
2106
            best_ref= ref;
2107
            mx= ref_mx;
2108
            my= ref_my;
2109
        }
2110
    }
2111
    //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
2112

    
2113
  //  subpel search
2114
    pc= s->c;
2115
    pc.bytestream_start=
2116
    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2117
    memcpy(p_state, s->block_state, sizeof(s->block_state));
2118

    
2119
    if(level!=s->block_max_depth)
2120
        put_rac(&pc, &p_state[4 + s_context], 1);
2121
    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2122
    if(s->ref_frames > 1)
2123
        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2124
    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
2125
    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2126
    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2127
    p_len= pc.bytestream - pc.bytestream_start;
2128
    score += (s->lambda2*(p_len*8
2129
              + (pc.outstanding_count - s->c.outstanding_count)*8
2130
              + (-av_log2(pc.range)    + av_log2(s->c.range))
2131
             ))>>FF_LAMBDA_SHIFT;
2132

    
2133
    block_s= block_w*block_w;
2134
    sum = pix_sum(current_data[0], stride, block_w);
2135
    l= (sum + block_s/2)/block_s;
2136
    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2137

    
2138
    block_s= block_w*block_w>>2;
2139
    sum = pix_sum(current_data[1], uvstride, block_w>>1);
2140
    cb= (sum + block_s/2)/block_s;
2141
//    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2142
    sum = pix_sum(current_data[2], uvstride, block_w>>1);
2143
    cr= (sum + block_s/2)/block_s;
2144
//    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2145

    
2146
    ic= s->c;
2147
    ic.bytestream_start=
2148
    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2149
    memcpy(i_state, s->block_state, sizeof(s->block_state));
2150
    if(level!=s->block_max_depth)
2151
        put_rac(&ic, &i_state[4 + s_context], 1);
2152
    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2153
    put_symbol(&ic, &i_state[32],  l-pl , 1);
2154
    put_symbol(&ic, &i_state[64], cb-pcb, 1);
2155
    put_symbol(&ic, &i_state[96], cr-pcr, 1);
2156
    i_len= ic.bytestream - ic.bytestream_start;
2157
    iscore += (s->lambda2*(i_len*8
2158
              + (ic.outstanding_count - s->c.outstanding_count)*8
2159
              + (-av_log2(ic.range)    + av_log2(s->c.range))
2160
             ))>>FF_LAMBDA_SHIFT;
2161

    
2162
//    assert(score==256*256*256*64-1);
2163
    assert(iscore < 255*255*256 + s->lambda2*10);
2164
    assert(iscore >= 0);
2165
    assert(l>=0 && l<=255);
2166
    assert(pl>=0 && pl<=255);
2167

    
2168
    if(level==0){
2169
        int varc= iscore >> 8;
2170
        int vard= score >> 8;
2171
        if (vard <= 64 || vard < varc)
2172
            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2173
        else
2174
            c->scene_change_score+= s->m.qscale;
2175
    }
2176

    
2177
    if(level!=s->block_max_depth){
2178
        put_rac(&s->c, &s->block_state[4 + s_context], 0);
2179
        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2180
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2181
        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2182
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2183
        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2184

    
2185
        if(score2 < score && score2 < iscore)
2186
            return score2;
2187
    }
2188

    
2189
    if(iscore < score){
2190
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2191
        memcpy(pbbak, i_buffer, i_len);
2192
        s->c= ic;
2193
        s->c.bytestream_start= pbbak_start;
2194
        s->c.bytestream= pbbak + i_len;
2195
        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2196
        memcpy(s->block_state, i_state, sizeof(s->block_state));
2197
        return iscore;
2198
    }else{
2199
        memcpy(pbbak, p_buffer, p_len);
2200
        s->c= pc;
2201
        s->c.bytestream_start= pbbak_start;
2202
        s->c.bytestream= pbbak + p_len;
2203
        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2204
        memcpy(s->block_state, p_state, sizeof(s->block_state));
2205
        return score;
2206
    }
2207
}
2208

    
2209
static av_always_inline int same_block(BlockNode *a, BlockNode *b){
2210
    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2211
        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2212
    }else{
2213
        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2214
    }
2215
}
2216

    
2217
static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2218
    const int w= s->b_width  << s->block_max_depth;
2219
    const int rem_depth= s->block_max_depth - level;
2220
    const int index= (x + y*w) << rem_depth;
2221
    int trx= (x+1)<<rem_depth;
2222
    BlockNode *b= &s->block[index];
2223
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2224
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
2225
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2226
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2227
    int pl = left->color[0];
2228
    int pcb= left->color[1];
2229
    int pcr= left->color[2];
2230
    int pmx, pmy;
2231
    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2232
    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2233
    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2234
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2235

    
2236
    if(s->keyframe){
2237
        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2238
        return;
2239
    }
2240

    
2241
    if(level!=s->block_max_depth){
2242
        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2243
            put_rac(&s->c, &s->block_state[4 + s_context], 1);
2244
        }else{
2245
            put_rac(&s->c, &s->block_state[4 + s_context], 0);
2246
            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2247
            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2248
            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2249
            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2250
            return;
2251
        }
2252
    }
2253
    if(b->type & BLOCK_INTRA){
2254
        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2255
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2256
        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2257
        put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2258
        put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2259
        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2260
    }else{
2261
        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2262
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2263
        if(s->ref_frames > 1)
2264
            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2265
        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2266
        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2267
        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2268
    }
2269
}
2270

    
2271
static void decode_q_branch(SnowContext *s, int level, int x, int y){
2272
    const int w= s->b_width << s->block_max_depth;
2273
    const int rem_depth= s->block_max_depth - level;
2274
    const int index= (x + y*w) << rem_depth;
2275
    int trx= (x+1)<<rem_depth;
2276
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2277
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
2278
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2279
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2280
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2281

    
2282
    if(s->keyframe){
2283
        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2284
        return;
2285
    }
2286

    
2287
    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2288
        int type;
2289
        int l = left->color[0];
2290
        int cb= left->color[1];
2291
        int cr= left->color[2];
2292
        int mx= mid_pred(left->mx, top->mx, tr->mx);
2293
        int my= mid_pred(left->my, top->my, tr->my);
2294
        int ref = 0;
2295
        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2296
        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2297
        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2298

    
2299
        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2300

    
2301
        if(type){
2302
            pred_mv(s, &mx, &my, 0, left, top, tr);
2303
            l += get_symbol(&s->c, &s->block_state[32], 1);
2304
            cb+= get_symbol(&s->c, &s->block_state[64], 1);
2305
            cr+= get_symbol(&s->c, &s->block_state[96], 1);
2306
        }else{
2307
            if(s->ref_frames > 1)
2308
                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2309
            pred_mv(s, &mx, &my, ref, left, top, tr);
2310
            mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2311
            my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2312
        }
2313
        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2314
    }else{
2315
        decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2316
        decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2317
        decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2318
        decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2319
    }
2320
}
2321

    
2322
static void encode_blocks(SnowContext *s, int search){
2323
    int x, y;
2324
    int w= s->b_width;
2325
    int h= s->b_height;
2326

    
2327
    if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2328
        iterative_me(s);
2329

    
2330
    for(y=0; y<h; y++){
2331
        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2332
            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2333
            return;
2334
        }
2335
        for(x=0; x<w; x++){
2336
            if(s->avctx->me_method == ME_ITER || !search)
2337
                encode_q_branch2(s, 0, x, y);
2338
            else
2339
                encode_q_branch (s, 0, x, y);
2340
        }
2341
    }
2342
}
2343

    
2344
static void decode_blocks(SnowContext *s){
2345
    int x, y;
2346
    int w= s->b_width;
2347
    int h= s->b_height;
2348

    
2349
    for(y=0; y<h; y++){
2350
        for(x=0; x<w; x++){
2351
            decode_q_branch(s, 0, x, y);
2352
        }
2353
    }
2354
}
2355

    
2356
static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2357
    int x, y;
2358
START_TIMER
2359
    for(y=0; y < b_h+5; y++){
2360
        for(x=0; x < b_w; x++){
2361
            int a0= src[x    ];
2362
            int a1= src[x + 1];
2363
            int a2= src[x + 2];
2364
            int a3= src[x + 3];
2365
            int a4= src[x + 4];
2366
            int a5= src[x + 5];
2367
//            int am= 9*(a1+a2) - (a0+a3);
2368
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2369
//            int am= 18*(a2+a3) - 2*(a1+a4);
2370
//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2371
//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2372

    
2373
//            if(b_w==16) am= 8*(a1+a2);
2374

    
2375
            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2376
            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2377

    
2378
            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2379
            if(am&(~255)) am= ~(am>>31);
2380

    
2381
            tmp[x] = am;
2382

    
2383
/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2384
            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2385
            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2386
            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2387
        }
2388
        tmp += stride;
2389
        src += stride;
2390
    }
2391
    tmp -= (b_h+5)*stride;
2392

    
2393
    for(y=0; y < b_h; y++){
2394
        for(x=0; x < b_w; x++){
2395
            int a0= tmp[x + 0*stride];
2396
            int a1= tmp[x + 1*stride];
2397
            int a2= tmp[x + 2*stride];
2398
            int a3= tmp[x + 3*stride];
2399
            int a4= tmp[x + 4*stride];
2400
            int a5= tmp[x + 5*stride];
2401
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2402
//            int am= 18*(a2+a3) - 2*(a1+a4);
2403
/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2404
            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2405

    
2406
//            if(b_w==16) am= 8*(a1+a2);
2407

    
2408
            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2409
            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2410

    
2411
            if(am&(~255)) am= ~(am>>31);
2412

    
2413
            dst[x] = am;
2414
/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2415
            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2416
            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2417
            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2418
        }
2419
        dst += stride;
2420
        tmp += stride;
2421
    }
2422
STOP_TIMER("mc_block")
2423
}
2424

    
2425
#define mca(dx,dy,b_w)\
2426
static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
2427
    uint8_t tmp[stride*(b_w+5)];\
2428
    assert(h==b_w);\
2429
    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2430
}
2431

    
2432
mca( 0, 0,16)
2433
mca( 8, 0,16)
2434
mca( 0, 8,16)
2435
mca( 8, 8,16)
2436
mca( 0, 0,8)
2437
mca( 8, 0,8)
2438
mca( 0, 8,8)
2439
mca( 8, 8,8)
2440

    
2441
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2442
    if(block->type & BLOCK_INTRA){
2443
        int x, y;
2444
        const int color = block->color[plane_index];
2445
        const int color4= color*0x01010101;
2446
        if(b_w==32){
2447
            for(y=0; y < b_h; y++){
2448
                *(uint32_t*)&dst[0 + y*stride]= color4;
2449
                *(uint32_t*)&dst[4 + y*stride]= color4;
2450
                *(uint32_t*)&dst[8 + y*stride]= color4;
2451
                *(uint32_t*)&dst[12+ y*stride]= color4;
2452
                *(uint32_t*)&dst[16+ y*stride]= color4;
2453
                *(uint32_t*)&dst[20+ y*stride]= color4;
2454
                *(uint32_t*)&dst[24+ y*stride]= color4;
2455
                *(uint32_t*)&dst[28+ y*stride]= color4;
2456
            }
2457
        }else if(b_w==16){
2458
            for(y=0; y < b_h; y++){
2459
                *(uint32_t*)&dst[0 + y*stride]= color4;
2460
                *(uint32_t*)&dst[4 + y*stride]= color4;
2461
                *(uint32_t*)&dst[8 + y*stride]= color4;
2462
                *(uint32_t*)&dst[12+ y*stride]= color4;
2463
            }
2464
        }else if(b_w==8){
2465
            for(y=0; y < b_h; y++){
2466
                *(uint32_t*)&dst[0 + y*stride]= color4;
2467
                *(uint32_t*)&dst[4 + y*stride]= color4;
2468
            }
2469
        }else if(b_w==4){
2470
            for(y=0; y < b_h; y++){
2471
                *(uint32_t*)&dst[0 + y*stride]= color4;
2472
            }
2473
        }else{
2474
            for(y=0; y < b_h; y++){
2475
                for(x=0; x < b_w; x++){
2476
                    dst[x + y*stride]= color;
2477
                }
2478
            }
2479
        }
2480
    }else{
2481
        uint8_t *src= s->last_picture[block->ref].data[plane_index];
2482
        const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2483
        int mx= block->mx*scale;
2484
        int my= block->my*scale;
2485
        const int dx= mx&15;
2486
        const int dy= my&15;
2487
        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2488
        sx += (mx>>4) - 2;
2489
        sy += (my>>4) - 2;
2490
        src += sx + sy*stride;
2491
        if(   (unsigned)sx >= w - b_w - 4
2492
           || (unsigned)sy >= h - b_h - 4){
2493
            ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2494
            src= tmp + MB_SIZE;
2495
        }
2496
//        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2497
//        assert(!(b_w&(b_w-1)));
2498
        assert(b_w>1 && b_h>1);
2499
        assert(tab_index>=0 && tab_index<4 || b_w==32);
2500
        if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2501
            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2502
        else if(b_w==32){
2503
            int y;
2504
            for(y=0; y<b_h; y+=16){
2505
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2506
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2507
            }
2508
        }else if(b_w==b_h)
2509
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2510
        else if(b_w==2*b_h){
2511
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2512
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2513
        }else{
2514
            assert(2*b_w==b_h);
2515
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2516
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2517
        }
2518
    }
2519
}
2520

    
2521
void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2522
                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2523
    int y, x;
2524
    DWTELEM * dst;
2525
    for(y=0; y<b_h; y++){
2526
        //FIXME ugly missue of obmc_stride
2527
        uint8_t *obmc1= obmc + y*obmc_stride;
2528
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2529
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2530
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2531
        dst = slice_buffer_get_line(sb, src_y + y);
2532
        for(x=0; x<b_w; x++){
2533
            int v=   obmc1[x] * block[3][x + y*src_stride]
2534
                    +obmc2[x] * block[2][x + y*src_stride]
2535
                    +obmc3[x] * block[1][x + y*src_stride]
2536
                    +obmc4[x] * block[0][x + y*src_stride];
2537

    
2538
            v <<= 8 - LOG2_OBMC_MAX;
2539
            if(FRAC_BITS != 8){
2540
                v += 1<<(7 - FRAC_BITS);
2541
                v >>= 8 - FRAC_BITS;
2542
            }
2543
            if(add){
2544
                v += dst[x + src_x];
2545
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2546
                if(v&(~255)) v= ~(v>>31);
2547
                dst8[x + y*src_stride] = v;
2548
            }else{
2549
                dst[x + src_x] -= v;
2550
            }
2551
        }
2552
    }
2553
}
2554

    
2555
//FIXME name clenup (b_w, block_w, b_width stuff)
2556
static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2557
    const int b_width = s->b_width  << s->block_max_depth;
2558
    const int b_height= s->b_height << s->block_max_depth;
2559
    const int b_stride= b_width;
2560
    BlockNode *lt= &s->block[b_x + b_y*b_stride];
2561
    BlockNode *rt= lt+1;
2562
    BlockNode *lb= lt+b_stride;
2563
    BlockNode *rb= lb+1;
2564
    uint8_t *block[4];
2565
    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2566
    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2567
    uint8_t *ptmp;
2568
    int x,y;
2569

    
2570
    if(b_x<0){
2571
        lt= rt;
2572
        lb= rb;
2573
    }else if(b_x + 1 >= b_width){
2574
        rt= lt;
2575
        rb= lb;
2576
    }
2577
    if(b_y<0){
2578
        lt= lb;
2579
        rt= rb;
2580
    }else if(b_y + 1 >= b_height){
2581
        lb= lt;
2582
        rb= rt;
2583
    }
2584

    
2585
    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2586
        obmc -= src_x;
2587
        b_w += src_x;
2588
        if(!sliced && !offset_dst)
2589
            dst -= src_x;
2590
        src_x=0;
2591
    }else if(src_x + b_w > w){
2592
        b_w = w - src_x;
2593
    }
2594
    if(src_y<0){
2595
        obmc -= src_y*obmc_stride;
2596
        b_h += src_y;
2597
        if(!sliced && !offset_dst)
2598
            dst -= src_y*dst_stride;
2599
        src_y=0;
2600
    }else if(src_y + b_h> h){
2601
        b_h = h - src_y;
2602
    }
2603

    
2604
    if(b_w<=0 || b_h<=0) return;
2605

    
2606
assert(src_stride > 2*MB_SIZE + 5);
2607
    if(!sliced && offset_dst)
2608
        dst += src_x + src_y*dst_stride;
2609
    dst8+= src_x + src_y*src_stride;
2610
//    src += src_x + src_y*src_stride;
2611

    
2612
    ptmp= tmp + 3*tmp_step;
2613
    block[0]= ptmp;
2614
    ptmp+=tmp_step;
2615
    pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2616

    
2617
    if(same_block(lt, rt)){
2618
        block[1]= block[0];
2619
    }else{
2620
        block[1]= ptmp;
2621
        ptmp+=tmp_step;
2622
        pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2623
    }
2624

    
2625
    if(same_block(lt, lb)){
2626
        block[2]= block[0];
2627
    }else if(same_block(rt, lb)){
2628
        block[2]= block[1];
2629
    }else{
2630
        block[2]= ptmp;
2631
        ptmp+=tmp_step;
2632
        pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2633
    }
2634

    
2635
    if(same_block(lt, rb) ){
2636
        block[3]= block[0];
2637
    }else if(same_block(rt, rb)){
2638
        block[3]= block[1];
2639
    }else if(same_block(lb, rb)){
2640
        block[3]= block[2];
2641
    }else{
2642
        block[3]= ptmp;
2643
        pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2644
    }
2645
#if 0
2646
    for(y=0; y<b_h; y++){
2647
        for(x=0; x<b_w; x++){
2648
            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2649
            if(add) dst[x + y*dst_stride] += v;
2650
            else    dst[x + y*dst_stride] -= v;
2651
        }
2652
    }
2653
    for(y=0; y<b_h; y++){
2654
        uint8_t *obmc2= obmc + (obmc_stride>>1);
2655
        for(x=0; x<b_w; x++){
2656
            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2657
            if(add) dst[x + y*dst_stride] += v;
2658
            else    dst[x + y*dst_stride] -= v;
2659
        }
2660
    }
2661
    for(y=0; y<b_h; y++){
2662
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2663
        for(x=0; x<b_w; x++){
2664
            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2665
            if(add) dst[x + y*dst_stride] += v;
2666
            else    dst[x + y*dst_stride] -= v;
2667
        }
2668
    }
2669
    for(y=0; y<b_h; y++){
2670
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2671
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2672
        for(x=0; x<b_w; x++){
2673
            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2674
            if(add) dst[x + y*dst_stride] += v;
2675
            else    dst[x + y*dst_stride] -= v;
2676
        }
2677
    }
2678
#else
2679
    if(sliced){
2680
        START_TIMER
2681

    
2682
        s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2683
        STOP_TIMER("inner_add_yblock")
2684
    }else
2685
    for(y=0; y<b_h; y++){
2686
        //FIXME ugly missue of obmc_stride
2687
        uint8_t *obmc1= obmc + y*obmc_stride;
2688
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2689
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2690
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2691
        for(x=0; x<b_w; x++){
2692
            int v=   obmc1[x] * block[3][x + y*src_stride]
2693
                    +obmc2[x] * block[2][x + y*src_stride]
2694
                    +obmc3[x] * block[1][x + y*src_stride]
2695
                    +obmc4[x] * block[0][x + y*src_stride];
2696

    
2697
            v <<= 8 - LOG2_OBMC_MAX;
2698
            if(FRAC_BITS != 8){
2699
                v += 1<<(7 - FRAC_BITS);
2700
                v >>= 8 - FRAC_BITS;
2701
            }
2702
            if(add){
2703
                v += dst[x + y*dst_stride];
2704
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2705
                if(v&(~255)) v= ~(v>>31);
2706
                dst8[x + y*src_stride] = v;
2707
            }else{
2708
                dst[x + y*dst_stride] -= v;
2709
            }
2710
        }
2711
    }
2712
#endif
2713
}
2714

    
2715
static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2716
    Plane *p= &s->plane[plane_index];
2717
    const int mb_w= s->b_width  << s->block_max_depth;
2718
    const int mb_h= s->b_height << s->block_max_depth;
2719
    int x, y, mb_x;
2720
    int block_size = MB_SIZE >> s->block_max_depth;
2721
    int block_w    = plane_index ? block_size/2 : block_size;
2722
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2723
    int obmc_stride= plane_index ? block_size : 2*block_size;
2724
    int ref_stride= s->current_picture.linesize[plane_index];
2725
    uint8_t *dst8= s->current_picture.data[plane_index];
2726
    int w= p->width;
2727
    int h= p->height;
2728
    START_TIMER
2729

    
2730
    if(s->keyframe || (s->avctx->debug&512)){
2731
        if(mb_y==mb_h)
2732
            return;
2733

    
2734
        if(add){
2735
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2736
            {
2737
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2738
                DWTELEM * line = sb->line[y];
2739
                for(x=0; x<w; x++)
2740
                {
2741
//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2742
                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2743
                    v >>= FRAC_BITS;
2744
                    if(v&(~255)) v= ~(v>>31);
2745
                    dst8[x + y*ref_stride]= v;
2746
                }
2747
            }
2748
        }else{
2749
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2750
            {
2751
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2752
                DWTELEM * line = sb->line[y];
2753
                for(x=0; x<w; x++)
2754
                {
2755
                    line[x] -= 128 << FRAC_BITS;
2756
//                    buf[x + y*w]-= 128<<FRAC_BITS;
2757
                }
2758
            }
2759
        }
2760

    
2761
        return;
2762
    }
2763

    
2764
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2765
            START_TIMER
2766

    
2767
            add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2768
                       block_w*mb_x - block_w/2,
2769
                       block_w*mb_y - block_w/2,
2770
                       block_w, block_w,
2771
                       w, h,
2772
                       w, ref_stride, obmc_stride,
2773
                       mb_x - 1, mb_y - 1,
2774
                       add, 0, plane_index);
2775

    
2776
            STOP_TIMER("add_yblock")
2777
        }
2778

    
2779
    STOP_TIMER("predict_slice")
2780
}
2781

    
2782
static av_always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2783
    Plane *p= &s->plane[plane_index];
2784
    const int mb_w= s->b_width  << s->block_max_depth;
2785
    const int mb_h= s->b_height << s->block_max_depth;
2786
    int x, y, mb_x;
2787
    int block_size = MB_SIZE >> s->block_max_depth;
2788
    int block_w    = plane_index ? block_size/2 : block_size;
2789
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2790
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2791
    int ref_stride= s->current_picture.linesize[plane_index];
2792
    uint8_t *dst8= s->current_picture.data[plane_index];
2793
    int w= p->width;
2794
    int h= p->height;
2795
    START_TIMER
2796

    
2797
    if(s->keyframe || (s->avctx->debug&512)){
2798
        if(mb_y==mb_h)
2799
            return;
2800

    
2801
        if(add){
2802
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2803
                for(x=0; x<w; x++){
2804
                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2805
                    v >>= FRAC_BITS;
2806
                    if(v&(~255)) v= ~(v>>31);
2807
                    dst8[x + y*ref_stride]= v;
2808
                }
2809
            }
2810
        }else{
2811
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2812
                for(x=0; x<w; x++){
2813
                    buf[x + y*w]-= 128<<FRAC_BITS;
2814
                }
2815
            }
2816
        }
2817

    
2818
        return;
2819
    }
2820

    
2821
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2822
            START_TIMER
2823

    
2824
            add_yblock(s, 0, NULL, buf, dst8, obmc,
2825
                       block_w*mb_x - block_w/2,
2826
                       block_w*mb_y - block_w/2,
2827
                       block_w, block_w,
2828
                       w, h,
2829
                       w, ref_stride, obmc_stride,
2830
                       mb_x - 1, mb_y - 1,
2831
                       add, 1, plane_index);
2832

    
2833
            STOP_TIMER("add_yblock")
2834
        }
2835

    
2836
    STOP_TIMER("predict_slice")
2837
}
2838

    
2839
static av_always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2840
    const int mb_h= s->b_height << s->block_max_depth;
2841
    int mb_y;
2842
    for(mb_y=0; mb_y<=mb_h; mb_y++)
2843
        predict_slice(s, buf, plane_index, add, mb_y);
2844
}
2845

    
2846
static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2847
    int i, x2, y2;
2848
    Plane *p= &s->plane[plane_index];
2849
    const int block_size = MB_SIZE >> s->block_max_depth;
2850
    const int block_w    = plane_index ? block_size/2 : block_size;
2851
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2852
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2853
    const int ref_stride= s->current_picture.linesize[plane_index];
2854
    uint8_t *src= s-> input_picture.data[plane_index];
2855
    DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2856
    const int b_stride = s->b_width << s->block_max_depth;
2857
    const int w= p->width;
2858
    const int h= p->height;
2859
    int index= mb_x + mb_y*b_stride;
2860
    BlockNode *b= &s->block[index];
2861
    BlockNode backup= *b;
2862
    int ab=0;
2863
    int aa=0;
2864

    
2865
    b->type|= BLOCK_INTRA;
2866
    b->color[plane_index]= 0;
2867
    memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2868

    
2869
    for(i=0; i<4; i++){
2870
        int mb_x2= mb_x + (i &1) - 1;
2871
        int mb_y2= mb_y + (i>>1) - 1;
2872
        int x= block_w*mb_x2 + block_w/2;
2873
        int y= block_w*mb_y2 + block_w/2;
2874

    
2875
        add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2876
                    x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2877

    
2878
        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2879
            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2880
                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2881
                int obmc_v= obmc[index];
2882
                int d;
2883
                if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2884
                if(x<0) obmc_v += obmc[index + block_w];
2885
                if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2886
                if(x+block_w>w) obmc_v += obmc[index - block_w];
2887
                //FIXME precalc this or simplify it somehow else
2888

    
2889
                d = -dst[index] + (1<<(FRAC_BITS-1));
2890
                dst[index] = d;
2891
                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2892
                aa += obmc_v * obmc_v; //FIXME precalclate this
2893
            }
2894
        }
2895
    }
2896
    *b= backup;
2897

    
2898
    return clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
2899
}
2900

    
2901
static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2902
    const int b_stride = s->b_width << s->block_max_depth;
2903
    const int b_height = s->b_height<< s->block_max_depth;
2904
    int index= x + y*b_stride;
2905
    BlockNode *b     = &s->block[index];
2906
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2907
    BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2908
    BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2909
    BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2910
    int dmx, dmy;
2911
//  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2912
//  int my_context= av_log2(2*FFABS(left->my - top->my));
2913

    
2914
    if(x<0 || x>=b_stride || y>=b_height)
2915
        return 0;
2916
/*
2917
1            0      0
2918
01X          1-2    1
2919
001XX        3-6    2-3
2920
0001XXX      7-14   4-7
2921
00001XXXX   15-30   8-15
2922
*/
2923
//FIXME try accurate rate
2924
//FIXME intra and inter predictors if surrounding blocks arent the same type
2925
    if(b->type & BLOCK_INTRA){
2926
        return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2927
                   + av_log2(2*FFABS(left->color[1] - b->color[1]))
2928
                   + av_log2(2*FFABS(left->color[2] - b->color[2])));
2929
    }else{
2930
        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2931
        dmx-= b->mx;
2932
        dmy-= b->my;
2933
        return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2934
                    + av_log2(2*FFABS(dmy))
2935
                    + av_log2(2*b->ref));
2936
    }
2937
}
2938

    
2939
static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2940
    Plane *p= &s->plane[plane_index];
2941
    const int block_size = MB_SIZE >> s->block_max_depth;
2942
    const int block_w    = plane_index ? block_size/2 : block_size;
2943
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2944
    const int ref_stride= s->current_picture.linesize[plane_index];
2945
    uint8_t *dst= s->current_picture.data[plane_index];
2946
    uint8_t *src= s->  input_picture.data[plane_index];
2947
    DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2948
    uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2949
    uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
2950
    const int b_stride = s->b_width << s->block_max_depth;
2951
    const int b_height = s->b_height<< s->block_max_depth;
2952
    const int w= p->width;
2953
    const int h= p->height;
2954
    int distortion;
2955
    int rate= 0;
2956
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2957
    int sx= block_w*mb_x - block_w/2;
2958
    int sy= block_w*mb_y - block_w/2;
2959
    int x0= FFMAX(0,-sx);
2960
    int y0= FFMAX(0,-sy);
2961
    int x1= FFMIN(block_w*2, w-sx);
2962
    int y1= FFMIN(block_w*2, h-sy);
2963
    int i,x,y;
2964

    
2965
    pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2966

    
2967
    for(y=y0; y<y1; y++){
2968
        const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2969
        const DWTELEM *pred1 = pred + y*obmc_stride;
2970
        uint8_t *cur1 = cur + y*ref_stride;
2971
        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2972
        for(x=x0; x<x1; x++){
2973
            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2974
            v = (v + pred1[x]) >> FRAC_BITS;
2975
            if(v&(~255)) v= ~(v>>31);
2976
            dst1[x] = v;
2977
        }
2978
    }
2979

    
2980
    /* copy the regions where obmc[] = (uint8_t)256 */
2981
    if(LOG2_OBMC_MAX == 8
2982
        && (mb_x == 0 || mb_x == b_stride-1)
2983
        && (mb_y == 0 || mb_y == b_height-1)){
2984
        if(mb_x == 0)
2985
            x1 = block_w;
2986
        else
2987
            x0 = block_w;
2988
        if(mb_y == 0)
2989
            y1 = block_w;
2990
        else
2991
            y0 = block_w;
2992
        for(y=y0; y<y1; y++)
2993
            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2994
    }
2995

    
2996
    if(block_w==16){
2997
        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
2998
        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
2999
        /* FIXME cmps overlap but don't cover the wavelet's whole support,
3000
         * so improving the score of one block is not strictly guaranteed to
3001
         * improve the score of the whole frame, so iterative motion est
3002
         * doesn't always converge. */
3003
        if(s->avctx->me_cmp == FF_CMP_W97)
3004
            distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3005
        else if(s->avctx->me_cmp == FF_CMP_W53)
3006
            distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3007
        else{
3008
            distortion = 0;
3009
            for(i=0; i<4; i++){
3010
                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3011
                distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3012
            }
3013
        }
3014
    }else{
3015
        assert(block_w==8);
3016
        distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3017
    }
3018

    
3019
    if(plane_index==0){
3020
        for(i=0; i<4; i++){
3021
/* ..RRr
3022
 * .RXx.
3023
 * rxx..
3024
 */
3025
            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3026
        }
3027
        if(mb_x == b_stride-2)
3028
            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3029
    }
3030
    return distortion + rate*penalty_factor;
3031
}
3032

    
3033
static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3034
    int i, y2;
3035
    Plane *p= &s->plane[plane_index];
3036
    const int block_size = MB_SIZE >> s->block_max_depth;
3037
    const int block_w    = plane_index ? block_size/2 : block_size;
3038
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3039
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3040
    const int ref_stride= s->current_picture.linesize[plane_index];
3041
    uint8_t *dst= s->current_picture.data[plane_index];
3042
    uint8_t *src= s-> input_picture.data[plane_index];
3043
    static const DWTELEM zero_dst[4096]; //FIXME
3044
    const int b_stride = s->b_width << s->block_max_depth;
3045
    const int w= p->width;
3046
    const int h= p->height;
3047
    int distortion= 0;
3048
    int rate= 0;
3049
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3050

    
3051
    for(i=0; i<9; i++){
3052
        int mb_x2= mb_x + (i%3) - 1;
3053
        int mb_y2= mb_y + (i/3) - 1;
3054
        int x= block_w*mb_x2 + block_w/2;
3055
        int y= block_w*mb_y2 + block_w/2;
3056

    
3057
        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
3058
                   x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3059

    
3060
        //FIXME find a cleaner/simpler way to skip the outside stuff
3061
        for(y2= y; y2<0; y2++)
3062
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3063
        for(y2= h; y2<y+block_w; y2++)
3064
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3065
        if(x<0){
3066
            for(y2= y; y2<y+block_w; y2++)
3067
                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3068
        }
3069
        if(x+block_w > w){
3070
            for(y2= y; y2<y+block_w; y2++)
3071
                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3072
        }
3073

    
3074
        assert(block_w== 8 || block_w==16);
3075
        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3076
    }
3077

    
3078
    if(plane_index==0){
3079
        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3080
        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3081

    
3082
/* ..RRRr
3083
 * .RXXx.
3084
 * .RXXx.
3085
 * rxxx.
3086
 */
3087
        if(merged)
3088
            rate = get_block_bits(s, mb_x, mb_y, 2);
3089
        for(i=merged?4:0; i<9; i++){
3090
            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3091
            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3092
        }
3093
    }
3094
    return distortion + rate*penalty_factor;
3095
}
3096

    
3097
static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3098
    const int b_stride= s->b_width << s->block_max_depth;
3099
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3100
    BlockNode backup= *block;
3101
    int rd, index, value;
3102

    
3103
    assert(mb_x>=0 && mb_y>=0);
3104
    assert(mb_x<b_stride);
3105

    
3106
    if(intra){
3107
        block->color[0] = p[0];
3108
        block->color[1] = p[1];
3109
        block->color[2] = p[2];
3110
        block->type |= BLOCK_INTRA;
3111
    }else{
3112
        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3113
        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3114
        if(s->me_cache[index] == value)
3115
            return 0;
3116
        s->me_cache[index]= value;
3117

    
3118
        block->mx= p[0];
3119
        block->my= p[1];
3120
        block->type &= ~BLOCK_INTRA;
3121
    }
3122

    
3123
    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3124

    
3125
//FIXME chroma
3126
    if(rd < *best_rd){
3127
        *best_rd= rd;
3128
        return 1;
3129
    }else{
3130
        *block= backup;
3131
        return 0;
3132
    }
3133
}
3134

    
3135
/* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3136
static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3137
    int p[2] = {p0, p1};
3138
    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3139
}
3140

    
3141
static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3142
    const int b_stride= s->b_width << s->block_max_depth;
3143
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3144
    BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3145
    int rd, index, value;
3146

    
3147
    assert(mb_x>=0 && mb_y>=0);
3148
    assert(mb_x<b_stride);
3149
    assert(((mb_x|mb_y)&1) == 0);
3150

    
3151
    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3152
    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3153
    if(s->me_cache[index] == value)
3154
        return 0;
3155
    s->me_cache[index]= value;
3156

    
3157
    block->mx= p0;
3158
    block->my= p1;
3159
    block->ref= ref;
3160
    block->type &= ~BLOCK_INTRA;
3161
    block[1]= block[b_stride]= block[b_stride+1]= *block;
3162

    
3163
    rd= get_4block_rd(s, mb_x, mb_y, 0);
3164

    
3165
//FIXME chroma
3166
    if(rd < *best_rd){
3167
        *best_rd= rd;
3168
        return 1;
3169
    }else{
3170
        block[0]= backup[0];
3171
        block[1]= backup[1];
3172
        block[b_stride]= backup[2];
3173
        block[b_stride+1]= backup[3];
3174
        return 0;
3175
    }
3176
}
3177

    
3178
static void iterative_me(SnowContext *s){
3179
    int pass, mb_x, mb_y;
3180
    const int b_width = s->b_width  << s->block_max_depth;
3181
    const int b_height= s->b_height << s->block_max_depth;
3182
    const int b_stride= b_width;
3183
    int color[3];
3184

    
3185
    {
3186
        RangeCoder r = s->c;
3187
        uint8_t state[sizeof(s->block_state)];
3188
        memcpy(state, s->block_state, sizeof(s->block_state));
3189
        for(mb_y= 0; mb_y<s->b_height; mb_y++)
3190
            for(mb_x= 0; mb_x<s->b_width; mb_x++)
3191
                encode_q_branch(s, 0, mb_x, mb_y);
3192
        s->c = r;
3193
        memcpy(s->block_state, state, sizeof(s->block_state));
3194
    }
3195

    
3196
    for(pass=0; pass<25; pass++){
3197
        int change= 0;
3198

    
3199
        for(mb_y= 0; mb_y<b_height; mb_y++){
3200
            for(mb_x= 0; mb_x<b_width; mb_x++){
3201
                int dia_change, i, j, ref;
3202
                int best_rd= INT_MAX, ref_rd;
3203
                BlockNode backup, ref_b;
3204
                const int index= mb_x + mb_y * b_stride;
3205
                BlockNode *block= &s->block[index];
3206
                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3207
                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3208
                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3209
                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3210
                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3211
                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3212
                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3213
                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3214
                const int b_w= (MB_SIZE >> s->block_max_depth);
3215
                uint8_t obmc_edged[b_w*2][b_w*2];
3216

    
3217
                if(pass && (block->type & BLOCK_OPT))
3218
                    continue;
3219
                block->type |= BLOCK_OPT;
3220

    
3221
                backup= *block;
3222

    
3223
                if(!s->me_cache_generation)
3224
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3225
                s->me_cache_generation += 1<<22;
3226

    
3227
                //FIXME precalc
3228
                {
3229
                    int x, y;
3230
                    memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3231
                    if(mb_x==0)
3232
                        for(y=0; y<b_w*2; y++)
3233
                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3234
                    if(mb_x==b_stride-1)
3235
                        for(y=0; y<b_w*2; y++)
3236
                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3237
                    if(mb_y==0){
3238
                        for(x=0; x<b_w*2; x++)
3239
                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
3240
                        for(y=1; y<b_w; y++)
3241
                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3242
                    }
3243
                    if(mb_y==b_height-1){
3244
                        for(x=0; x<b_w*2; x++)
3245
                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3246
                        for(y=b_w; y<b_w*2-1; y++)
3247
                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3248
                    }
3249
                }
3250

    
3251
                //skip stuff outside the picture
3252
                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3253
                {
3254
                    uint8_t *src= s->  input_picture.data[0];
3255
                    uint8_t *dst= s->current_picture.data[0];
3256
                    const int stride= s->current_picture.linesize[0];
3257
                    const int block_w= MB_SIZE >> s->block_max_depth;
3258
                    const int sx= block_w*mb_x - block_w/2;
3259
                    const int sy= block_w*mb_y - block_w/2;
3260
                    const int w= s->plane[0].width;
3261
                    const int h= s->plane[0].height;
3262
                    int y;
3263

    
3264
                    for(y=sy; y<0; y++)
3265
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3266
                    for(y=h; y<sy+block_w*2; y++)
3267
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3268
                    if(sx<0){
3269
                        for(y=sy; y<sy+block_w*2; y++)
3270
                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3271
                    }
3272
                    if(sx+block_w*2 > w){
3273
                        for(y=sy; y<sy+block_w*2; y++)
3274
                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3275
                    }
3276
                }
3277

    
3278
                // intra(black) = neighbors' contribution to the current block
3279
                for(i=0; i<3; i++)
3280
                    color[i]= get_dc(s, mb_x, mb_y, i);
3281

    
3282
                // get previous score (cant be cached due to OBMC)
3283
                if(pass > 0 && (block->type&BLOCK_INTRA)){
3284
                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
3285
                    check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3286
                }else
3287
                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3288

    
3289
                ref_b= *block;
3290
                ref_rd= best_rd;
3291
                for(ref=0; ref < s->ref_frames; ref++){
3292
                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3293
                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3294
                        continue;
3295
                    block->ref= ref;
3296
                    best_rd= INT_MAX;
3297

    
3298
                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3299
                    check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3300
                    if(tb)
3301
                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3302
                    if(lb)
3303
                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3304
                    if(rb)
3305
                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3306
                    if(bb)
3307
                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3308

    
3309
                    /* fullpel ME */
3310
                    //FIXME avoid subpel interpol / round to nearest integer
3311
                    do{
3312
                        dia_change=0;
3313
                        for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3314
                            for(j=0; j<i; j++){
3315
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3316
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3317
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3318
                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3319
                            }
3320
                        }
3321
                    }while(dia_change);
3322
                    /* subpel ME */
3323
                    do{
3324
                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3325
                        dia_change=0;
3326
                        for(i=0; i<8; i++)
3327
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3328
                    }while(dia_change);
3329
                    //FIXME or try the standard 2 pass qpel or similar
3330

    
3331
                    mvr[0][0]= block->mx;
3332
                    mvr[0][1]= block->my;
3333
                    if(ref_rd > best_rd){
3334
                        ref_rd= best_rd;
3335
                        ref_b= *block;
3336
                    }
3337
                }
3338
                best_rd= ref_rd;
3339
                *block= ref_b;
3340
#if 1
3341
                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3342
                //FIXME RD style color selection
3343
#endif
3344
                if(!same_block(block, &backup)){
3345
                    if(tb ) tb ->type &= ~BLOCK_OPT;
3346
                    if(lb ) lb ->type &= ~BLOCK_OPT;
3347
                    if(rb ) rb ->type &= ~BLOCK_OPT;
3348
                    if(bb ) bb ->type &= ~BLOCK_OPT;
3349
                    if(tlb) tlb->type &= ~BLOCK_OPT;
3350
                    if(trb) trb->type &= ~BLOCK_OPT;
3351
                    if(blb) blb->type &= ~BLOCK_OPT;
3352
                    if(brb) brb->type &= ~BLOCK_OPT;
3353
                    change ++;
3354
                }
3355
            }
3356
        }
3357
        av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3358
        if(!change)
3359
            break;
3360
    }
3361

    
3362
    if(s->block_max_depth == 1){
3363
        int change= 0;
3364
        for(mb_y= 0; mb_y<b_height; mb_y+=2){
3365
            for(mb_x= 0; mb_x<b_width; mb_x+=2){
3366
                int i;
3367
                int best_rd, init_rd;
3368
                const int index= mb_x + mb_y * b_stride;
3369
                BlockNode *b[4];
3370

    
3371
                b[0]= &s->block[index];
3372
                b[1]= b[0]+1;
3373
                b[2]= b[0]+b_stride;
3374
                b[3]= b[2]+1;
3375
                if(same_block(b[0], b[1]) &&
3376
                   same_block(b[0], b[2]) &&
3377
                   same_block(b[0], b[3]))
3378
                    continue;
3379

    
3380
                if(!s->me_cache_generation)
3381
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3382
                s->me_cache_generation += 1<<22;
3383

    
3384
                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3385

    
3386
                //FIXME more multiref search?
3387
                check_4block_inter(s, mb_x, mb_y,
3388
                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3389
                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3390

    
3391
                for(i=0; i<4; i++)
3392
                    if(!(b[i]->type&BLOCK_INTRA))
3393
                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3394

    
3395
                if(init_rd != best_rd)
3396
                    change++;
3397
            }
3398
        }
3399
        av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3400
    }
3401
}
3402

    
3403
static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3404
    const int level= b->level;
3405
    const int w= b->width;
3406
    const int h= b->height;
3407
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3408
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3409
    int x,y, thres1, thres2;
3410
//    START_TIMER
3411

    
3412
    if(s->qlog == LOSSLESS_QLOG) return;
3413

    
3414
    bias= bias ? 0 : (3*qmul)>>3;
3415
    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3416
    thres2= 2*thres1;
3417

    
3418
    if(!bias){
3419
        for(y=0; y<h; y++){
3420
            for(x=0; x<w; x++){
3421
                int i= src[x + y*stride];
3422

    
3423
                if((unsigned)(i+thres1) > thres2){
3424
                    if(i>=0){
3425
                        i<<= QEXPSHIFT;
3426
                        i/= qmul; //FIXME optimize
3427
                        src[x + y*stride]=  i;
3428
                    }else{
3429
                        i= -i;
3430
                        i<<= QEXPSHIFT;
3431
                        i/= qmul; //FIXME optimize
3432
                        src[x + y*stride]= -i;
3433
                    }
3434
                }else
3435
                    src[x + y*stride]= 0;
3436
            }
3437
        }
3438
    }else{
3439
        for(y=0; y<h; y++){
3440
            for(x=0; x<w; x++){
3441
                int i= src[x + y*stride];
3442

    
3443
                if((unsigned)(i+thres1) > thres2){
3444
                    if(i>=0){
3445
                        i<<= QEXPSHIFT;
3446
                        i= (i + bias) / qmul; //FIXME optimize
3447
                        src[x + y*stride]=  i;
3448
                    }else{
3449
                        i= -i;
3450
                        i<<= QEXPSHIFT;
3451
                        i= (i + bias) / qmul; //FIXME optimize
3452
                        src[x + y*stride]= -i;
3453
                    }
3454
                }else
3455
                    src[x + y*stride]= 0;
3456
            }
3457
        }
3458
    }
3459
    if(level+1 == s->spatial_decomposition_count){
3460
//        STOP_TIMER("quantize")
3461
    }
3462
}
3463

    
3464
static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3465
    const int w= b->width;
3466
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3467
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3468
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3469
    int x,y;
3470
    START_TIMER
3471

    
3472
    if(s->qlog == LOSSLESS_QLOG) return;
3473

    
3474
    for(y=start_y; y<end_y; y++){
3475
//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3476
        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3477
        for(x=0; x<w; x++){
3478
            int i= line[x];
3479
            if(i<0){
3480
                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3481
            }else if(i>0){
3482
                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3483
            }
3484
        }
3485
    }
3486
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3487
        STOP_TIMER("dquant")
3488
    }
3489
}
3490

    
3491
static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3492
    const int w= b->width;
3493
    const int h= b->height;
3494
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3495
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3496
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3497
    int x,y;
3498
    START_TIMER
3499

    
3500
    if(s->qlog == LOSSLESS_QLOG) return;
3501

    
3502
    for(y=0; y<h; y++){
3503
        for(x=0; x<w; x++){
3504
            int i= src[x + y*stride];
3505
            if(i<0){
3506
                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3507
            }else if(i>0){
3508
                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3509
            }
3510
        }
3511
    }
3512
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3513
        STOP_TIMER("dquant")
3514
    }
3515
}
3516

    
3517
static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3518
    const int w= b->width;
3519
    const int h= b->height;
3520
    int x,y;
3521

    
3522
    for(y=h-1; y>=0; y--){
3523
        for(x=w-1; x>=0; x--){
3524
            int i= x + y*stride;
3525

    
3526
            if(x){
3527
                if(use_median){
3528
                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3529
                    else  src[i] -= src[i - 1];
3530
                }else{
3531
                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3532
                    else  src[i] -= src[i - 1];
3533
                }
3534
            }else{
3535
                if(y) src[i] -= src[i - stride];
3536
            }
3537
        }
3538
    }
3539
}
3540

    
3541
static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3542
    const int w= b->width;
3543
    int x,y;
3544

    
3545
//    START_TIMER
3546

    
3547
    DWTELEM * line;
3548
    DWTELEM * prev;
3549

    
3550
    if (start_y != 0)
3551
        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3552

    
3553
    for(y=start_y; y<end_y; y++){
3554
        prev = line;
3555
//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3556
        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3557
        for(x=0; x<w; x++){
3558
            if(x){
3559
                if(use_median){
3560
                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3561
                    else  line[x] += line[x - 1];
3562
                }else{
3563
                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3564
                    else  line[x] += line[x - 1];
3565
                }
3566
            }else{
3567
                if(y) line[x] += prev[x];
3568
            }
3569
        }
3570
    }
3571

    
3572
//    STOP_TIMER("correlate")
3573
}
3574

    
3575
static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3576
    const int w= b->width;
3577
    const int h= b->height;
3578
    int x,y;
3579

    
3580
    for(y=0; y<h; y++){
3581
        for(x=0; x<w; x++){
3582
            int i= x + y*stride;
3583

    
3584
            if(x){
3585
                if(use_median){
3586
                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3587
                    else  src[i] += src[i - 1];
3588
                }else{
3589
                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3590
                    else  src[i] += src[i - 1];
3591
                }
3592
            }else{
3593
                if(y) src[i] += src[i - stride];
3594
            }
3595
        }
3596
    }
3597
}
3598

    
3599
static void encode_header(SnowContext *s){
3600
    int plane_index, level, orientation;
3601
    uint8_t kstate[32];
3602

    
3603
    memset(kstate, MID_STATE, sizeof(kstate));
3604

    
3605
    put_rac(&s->c, kstate, s->keyframe);
3606
    if(s->keyframe || s->always_reset)
3607
        reset_contexts(s);
3608
    if(s->keyframe){
3609
        put_symbol(&s->c, s->header_state, s->version, 0);
3610
        put_rac(&s->c, s->header_state, s->always_reset);
3611
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3612
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3613
        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3614
        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3615
        put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3616
        put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3617
        put_rac(&s->c, s->header_state, s->spatial_scalability);
3618
//        put_rac(&s->c, s->header_state, s->rate_scalability);
3619
        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3620

    
3621
        for(plane_index=0; plane_index<2; plane_index++){
3622
            for(level=0; level<s->spatial_decomposition_count; level++){
3623
                for(orientation=level ? 1:0; orientation<4; orientation++){
3624
                    if(orientation==2) continue;
3625
                    put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3626
                }
3627
            }
3628
        }
3629
    }
3630
    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type, 0);
3631
    put_symbol(&s->c, s->header_state, s->qlog, 1);
3632
    put_symbol(&s->c, s->header_state, s->mv_scale, 0);
3633
    put_symbol(&s->c, s->header_state, s->qbias, 1);
3634
    put_symbol(&s->c, s->header_state, s->block_max_depth, 0);
3635
}
3636

    
3637
static int decode_header(SnowContext *s){
3638
    int plane_index, level, orientation;
3639
    uint8_t kstate[32];
3640

    
3641
    memset(kstate, MID_STATE, sizeof(kstate));
3642

    
3643
    s->keyframe= get_rac(&s->c, kstate);
3644
    if(s->keyframe || s->always_reset)
3645
        reset_contexts(s);
3646
    if(s->keyframe){
3647
        s->version= get_symbol(&s->c, s->header_state, 0);
3648
        if(s->version>0){
3649
            av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3650
            return -1;
3651
        }
3652
        s->always_reset= get_rac(&s->c, s->header_state);
3653
        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3654
        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3655
        s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3656
        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3657
        s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3658
        s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3659
        s->spatial_scalability= get_rac(&s->c, s->header_state);
3660
//        s->rate_scalability= get_rac(&s->c, s->header_state);
3661
        s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3662

    
3663
        for(plane_index=0; plane_index<3; plane_index++){
3664
            for(level=0; level<s->spatial_decomposition_count; level++){
3665
                for(orientation=level ? 1:0; orientation<4; orientation++){
3666
                    int q;
3667
                    if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3668
                    else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3669
                    else                    q= get_symbol(&s->c, s->header_state, 1);
3670
                    s->plane[plane_index].band[level][orientation].qlog= q;
3671
                }
3672
            }
3673
        }
3674
    }
3675

    
3676
    s->spatial_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3677
    if(s->spatial_decomposition_type > 2){
3678
        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3679
        return -1;
3680
    }
3681

    
3682
    s->qlog= get_symbol(&s->c, s->header_state, 1);
3683
    s->mv_scale= get_symbol(&s->c, s->header_state, 0);
3684
    s->qbias= get_symbol(&s->c, s->header_state, 1);
3685
    s->block_max_depth= get_symbol(&s->c, s->header_state, 0);
3686
    if(s->block_max_depth > 1 || s->block_max_depth < 0){
3687
        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3688
        s->block_max_depth= 0;
3689
        return -1;
3690
    }
3691

    
3692
    return 0;
3693
}
3694

    
3695
static void init_qexp(void){
3696
    int i;
3697
    double v=128;
3698

    
3699
    for(i=0; i<QROOT; i++){
3700
        qexp[i]= lrintf(v);
3701
        v *= pow(2, 1.0 / QROOT);
3702
    }
3703
}
3704

    
3705
static int common_init(AVCodecContext *avctx){
3706
    SnowContext *s = avctx->priv_data;
3707
    int width, height;
3708
    int level, orientation, plane_index, dec;
3709
    int i, j;
3710

    
3711
    s->avctx= avctx;
3712

    
3713
    dsputil_init(&s->dsp, avctx);
3714

    
3715
#define mcf(dx,dy)\
3716
    s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3717
    s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3718
        s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3719
    s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3720
    s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3721
        s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3722

    
3723
    mcf( 0, 0)
3724
    mcf( 4, 0)
3725
    mcf( 8, 0)
3726
    mcf(12, 0)
3727
    mcf( 0, 4)
3728
    mcf( 4, 4)
3729
    mcf( 8, 4)
3730
    mcf(12, 4)
3731
    mcf( 0, 8)
3732
    mcf( 4, 8)
3733
    mcf( 8, 8)
3734
    mcf(12, 8)
3735
    mcf( 0,12)
3736
    mcf( 4,12)
3737
    mcf( 8,12)
3738
    mcf(12,12)
3739

    
3740
#define mcfh(dx,dy)\
3741
    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3742
    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3743
        mc_block_hpel ## dx ## dy ## 16;\
3744
    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3745
    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3746
        mc_block_hpel ## dx ## dy ## 8;
3747

    
3748
    mcfh(0, 0)
3749
    mcfh(8, 0)
3750
    mcfh(0, 8)
3751
    mcfh(8, 8)
3752

    
3753
    if(!qexp[0])
3754
        init_qexp();
3755

    
3756
    dec= s->spatial_decomposition_count= 5;
3757
    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3758

    
3759
    s->chroma_h_shift= 1; //FIXME XXX
3760
    s->chroma_v_shift= 1;
3761

    
3762
//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3763

    
3764
    width= s->avctx->width;
3765
    height= s->avctx->height;
3766

    
3767
    s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3768

    
3769
    s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3770
    s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3771

    
3772
    for(plane_index=0; plane_index<3; plane_index++){
3773
        int w= s->avctx->width;
3774
        int h= s->avctx->height;
3775

    
3776
        if(plane_index){
3777
            w>>= s->chroma_h_shift;
3778
            h>>= s->chroma_v_shift;
3779
        }
3780
        s->plane[plane_index].width = w;
3781
        s->plane[plane_index].height= h;
3782
//av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3783
        for(level=s->spatial_decomposition_count-1; level>=0; level--){
3784
            for(orientation=level ? 1 : 0; orientation<4; orientation++){
3785
                SubBand *b= &s->plane[plane_index].band[level][orientation];
3786

    
3787
                b->buf= s->spatial_dwt_buffer;
3788
                b->level= level;
3789
                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3790
                b->width = (w + !(orientation&1))>>1;
3791
                b->height= (h + !(orientation>1))>>1;
3792

    
3793
                b->stride_line = 1 << (s->spatial_decomposition_count - level);
3794
                b->buf_x_offset = 0;
3795
                b->buf_y_offset = 0;
3796

    
3797
                if(orientation&1){
3798
                    b->buf += (w+1)>>1;
3799
                    b->buf_x_offset = (w+1)>>1;
3800
                }
3801
                if(orientation>1){
3802
                    b->buf += b->stride>>1;
3803
                    b->buf_y_offset = b->stride_line >> 1;
3804
                }
3805

    
3806
                if(level)
3807
                    b->parent= &s->plane[plane_index].band[level-1][orientation];
3808
                b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3809
            }
3810
            w= (w+1)>>1;
3811
            h= (h+1)>>1;
3812
        }
3813
    }
3814

    
3815
    for(i=0; i<MAX_REF_FRAMES; i++)
3816
        for(j=0; j<MAX_REF_FRAMES; j++)
3817
            scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3818

    
3819
    reset_contexts(s);
3820
/*
3821
    width= s->width= avctx->width;
3822
    height= s->height= avctx->height;
3823

3824
    assert(width && height);
3825
*/
3826
    s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3827

    
3828
    return 0;
3829
}
3830

    
3831
static int qscale2qlog(int qscale){
3832
    return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3833
           + 61*QROOT/8; //<64 >60
3834
}
3835

    
3836
static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3837
{
3838
    /* estimate the frame's complexity as a sum of weighted dwt coefs.
3839
     * FIXME we know exact mv bits at this point,
3840
     * but ratecontrol isn't set up to include them. */
3841
    uint32_t coef_sum= 0;
3842
    int level, orientation, delta_qlog;
3843

    
3844
    for(level=0; level<s->spatial_decomposition_count; level++){
3845
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3846
            SubBand *b= &s->plane[0].band[level][orientation];
3847
            DWTELEM *buf= b->buf;
3848
            const int w= b->width;
3849
            const int h= b->height;
3850
            const int stride= b->stride;
3851
            const int qlog= clip(2*QROOT + b->qlog, 0, QROOT*16);
3852
            const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3853
            const int qdiv= (1<<16)/qmul;
3854
            int x, y;
3855
            if(orientation==0)
3856
                decorrelate(s, b, buf, stride, 1, 0);
3857
            for(y=0; y<h; y++)
3858
                for(x=0; x<w; x++)
3859
                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3860
            if(orientation==0)
3861
                correlate(s, b, buf, stride, 1, 0);
3862
        }
3863
    }
3864

    
3865
    /* ugly, ratecontrol just takes a sqrt again */
3866
    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3867
    assert(coef_sum < INT_MAX);
3868

    
3869
    if(pict->pict_type == I_TYPE){
3870
        s->m.current_picture.mb_var_sum= coef_sum;
3871
        s->m.current_picture.mc_mb_var_sum= 0;
3872
    }else{
3873
        s->m.current_picture.mc_mb_var_sum= coef_sum;
3874
        s->m.current_picture.mb_var_sum= 0;
3875
    }
3876

    
3877
    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3878
    if (pict->quality < 0)
3879
        return INT_MIN;
3880
    s->lambda= pict->quality * 3/2;
3881
    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3882
    s->qlog+= delta_qlog;
3883
    return delta_qlog;
3884
}
3885

    
3886
static void calculate_vissual_weight(SnowContext *s, Plane *p){
3887
    int width = p->width;
3888
    int height= p->height;
3889
    int level, orientation, x, y;
3890

    
3891
    for(level=0; level<s->spatial_decomposition_count; level++){
3892
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3893
            SubBand *b= &p->band[level][orientation];
3894
            DWTELEM *buf= b->buf;
3895
            int64_t error=0;
3896

    
3897
            memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3898
            buf[b->width/2 + b->height/2*b->stride]= 256*256;
3899
            ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3900
            for(y=0; y<height; y++){
3901
                for(x=0; x<width; x++){
3902
                    int64_t d= s->spatial_dwt_buffer[x + y*width];
3903
                    error += d*d;
3904
                }
3905
            }
3906

    
3907
            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3908
//            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3909
        }
3910
    }
3911
}
3912

    
3913
static int encode_init(AVCodecContext *avctx)
3914
{
3915
    SnowContext *s = avctx->priv_data;
3916
    int plane_index;
3917

    
3918
    if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3919
        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3920
               "use vstrict=-2 / -strict -2 to use it anyway\n");
3921
        return -1;
3922
    }
3923

    
3924
    if(avctx->prediction_method == DWT_97
3925
       && (avctx->flags & CODEC_FLAG_QSCALE)
3926
       && avctx->global_quality == 0){
3927
        av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
3928
        return -1;
3929
    }
3930

    
3931
    common_init(avctx);
3932
    alloc_blocks(s);
3933

    
3934
    s->version=0;
3935

    
3936
    s->m.avctx   = avctx;
3937
    s->m.flags   = avctx->flags;
3938
    s->m.bit_rate= avctx->bit_rate;
3939

    
3940
    s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3941
    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3942
    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3943
    s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3944
    h263_encode_init(&s->m); //mv_penalty
3945

    
3946
    s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
3947

    
3948
    if(avctx->flags&CODEC_FLAG_PASS1){
3949
        if(!avctx->stats_out)
3950
            avctx->stats_out = av_mallocz(256);
3951
    }
3952
    if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
3953
        if(ff_rate_control_init(&s->m) < 0)
3954
            return -1;
3955
    }
3956
    s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
3957

    
3958
    for(plane_index=0; plane_index<3; plane_index++){
3959
        calculate_vissual_weight(s, &s->plane[plane_index]);
3960
    }
3961

    
3962

    
3963
    avctx->coded_frame= &s->current_picture;
3964
    switch(avctx->pix_fmt){
3965
//    case PIX_FMT_YUV444P:
3966
//    case PIX_FMT_YUV422P:
3967
    case PIX_FMT_YUV420P:
3968
    case PIX_FMT_GRAY8:
3969
//    case PIX_FMT_YUV411P:
3970
//    case PIX_FMT_YUV410P:
3971
        s->colorspace_type= 0;
3972
        break;
3973
/*    case PIX_FMT_RGBA32:
3974
        s->colorspace= 1;
3975
        break;*/
3976
    default:
3977
        av_log(avctx, AV_LOG_ERROR, "format not supported\n");
3978
        return -1;
3979
    }
3980
//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
3981
    s->chroma_h_shift= 1;
3982
    s->chroma_v_shift= 1;
3983

    
3984
    ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
3985
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
3986

    
3987
    s->avctx->get_buffer(s->avctx, &s->input_picture);
3988

    
3989
    if(s->avctx->me_method == ME_ITER){
3990
        int i;
3991
        int size= s->b_width * s->b_height << 2*s->block_max_depth;
3992
        for(i=0; i<s->max_ref_frames; i++){
3993
            s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
3994
            s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
3995
        }
3996
    }
3997

    
3998
    return 0;
3999
}
4000

    
4001
static int frame_start(SnowContext *s){
4002
   AVFrame tmp;
4003
   int w= s->avctx->width; //FIXME round up to x16 ?
4004
   int h= s->avctx->height;
4005

    
4006
    if(s->current_picture.data[0]){
4007
        draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4008
        draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4009
        draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4010
    }
4011

    
4012
    tmp= s->last_picture[s->max_ref_frames-1];
4013
    memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4014
    s->last_picture[0]= s->current_picture;
4015
    s->current_picture= tmp;
4016

    
4017
    if(s->keyframe){
4018
        s->ref_frames= 0;
4019
    }else{
4020
        int i;
4021
        for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4022
            if(i && s->last_picture[i-1].key_frame)
4023
                break;
4024
        s->ref_frames= i;
4025
    }
4026

    
4027
    s->current_picture.reference= 1;
4028
    if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
4029
        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
4030
        return -1;
4031
    }
4032

    
4033
    s->current_picture.key_frame= s->keyframe;
4034

    
4035
    return 0;
4036
}
4037

    
4038
static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
4039
    SnowContext *s = avctx->priv_data;
4040
    RangeCoder * const c= &s->c;
4041
    AVFrame *pict = data;
4042
    const int width= s->avctx->width;
4043
    const int height= s->avctx->height;
4044
    int level, orientation, plane_index, i, y;
4045
    uint8_t rc_header_bak[sizeof(s->header_state)];
4046
    uint8_t rc_block_bak[sizeof(s-&