Statistics
| Branch: | Revision:

ffmpeg / libavcodec / snow.c @ d3ac6ed6

History | View | Annotate | Download (160 KB)

1
/*
2
 * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This library is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Lesser General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2 of the License, or (at your option) any later version.
8
 *
9
 * This library is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Lesser General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Lesser General Public
15
 * License along with this library; if not, write to the Free Software
16
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
 */
18

    
19
#include "avcodec.h"
20
#include "common.h"
21
#include "dsputil.h"
22

    
23
#include "rangecoder.h"
24
#define MID_STATE 128
25

    
26
#include "mpegvideo.h"
27

    
28
#include <zlib.h>
29

    
30
#undef NDEBUG
31
#include <assert.h>
32

    
33
#define MAX_DECOMPOSITIONS 8
34
#define MAX_PLANES 4
35
#define DWTELEM int
36
#define QSHIFT 5
37
#define QROOT (1<<QSHIFT)
38
#define LOSSLESS_QLOG -128
39
#define FRAC_BITS 8
40

    
41
static const int8_t quant3[256]={
42
 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
43
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
44
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
45
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
46
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
47
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
48
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
51
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
52
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
53
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
54
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
55
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
56
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
57
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
58
};
59
static const int8_t quant3b[256]={
60
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
61
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
62
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
63
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
64
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
65
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
66
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
67
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
68
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
69
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
70
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
71
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
72
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
73
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
74
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
75
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
76
};
77
static const int8_t quant3bA[256]={
78
 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
79
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
80
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
81
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
82
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
83
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
84
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
85
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
86
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
87
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
88
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
89
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
90
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
91
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
92
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
93
 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
94
};
95
static const int8_t quant5[256]={
96
 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
97
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
98
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
99
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
100
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
101
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
102
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
103
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
104
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
105
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
106
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
107
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
108
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
109
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
110
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
111
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
112
};
113
static const int8_t quant7[256]={
114
 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
115
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
116
 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
117
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
118
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
119
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
120
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
121
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
122
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
123
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
124
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
125
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
126
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
127
-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
128
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
129
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
130
};
131
static const int8_t quant9[256]={
132
 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
133
 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
134
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
135
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
136
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
137
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
138
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
139
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
140
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
141
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
142
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
143
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
144
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
145
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
146
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
147
-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
148
};
149
static const int8_t quant11[256]={
150
 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
151
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
152
 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
153
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
154
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
155
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
156
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
157
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
158
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
159
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
160
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
161
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
162
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
163
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
164
-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
165
-4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
166
};
167
static const int8_t quant13[256]={
168
 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
169
 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
170
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
171
 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
172
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
173
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
174
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
175
 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
176
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
177
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
178
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
179
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
180
-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
181
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
182
-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
183
-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
184
};
185

    
186
#define LOG2_OBMC_MAX 6
187
#define OBMC_MAX (1<<(LOG2_OBMC_MAX))
188
#if 0 //64*cubic
189
static const uint8_t obmc32[1024]={
190
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
192
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
193
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
194
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
195
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
196
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
197
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
198
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
199
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
200
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
201
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
202
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
203
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
204
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
205
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
206
 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
207
 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
208
 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
209
 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
210
 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
211
 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
212
 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
213
 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
214
 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
215
 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
216
 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
217
 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
218
 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
219
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
220
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
221
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
222
//error:0.000022
223
};
224
static const uint8_t obmc16[256]={
225
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
226
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
227
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
228
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
229
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
230
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
231
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
232
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
233
 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
234
 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
235
 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
236
 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
237
 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
238
 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
239
 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
240
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
241
//error:0.000033
242
};
243
#elif 1 // 64*linear
244
static const uint8_t obmc32[1024]={
245
 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
246
 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
247
 0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
248
 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,13,12,11,10, 9, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0,
249
 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,14,15,16,17,17,16,15,14,13,12,11,10, 8, 7, 6, 5, 4, 3, 2, 1,
250
 1, 2, 3, 5, 6, 8, 9,10,12,13,14,16,17,19,20,21,21,20,19,17,16,14,13,12,10, 9, 8, 6, 5, 3, 2, 1,
251
 1, 2, 4, 6, 7, 9,11,12,14,15,17,19,20,22,24,25,25,24,22,20,19,17,15,14,12,11, 9, 7, 6, 4, 2, 1,
252
 1, 3, 5, 7, 8,10,12,14,16,18,20,22,23,25,27,29,29,27,25,23,22,20,18,16,14,12,10, 8, 7, 5, 3, 1,
253
 1, 3, 5, 7,10,12,14,16,18,20,22,24,27,29,31,33,33,31,29,27,24,22,20,18,16,14,12,10, 7, 5, 3, 1,
254
 1, 4, 6, 8,11,13,15,18,20,23,25,27,30,32,34,37,37,34,32,30,27,25,23,20,18,15,13,11, 8, 6, 4, 1,
255
 1, 4, 7, 9,12,14,17,20,22,25,28,30,33,35,38,41,41,38,35,33,30,28,25,22,20,17,14,12, 9, 7, 4, 1,
256
 1, 4, 7,10,13,16,19,22,24,27,30,33,36,39,42,45,45,42,39,36,33,30,27,24,22,19,16,13,10, 7, 4, 1,
257
 2, 5, 8,11,14,17,20,23,27,30,33,36,39,42,45,48,48,45,42,39,36,33,30,27,23,20,17,14,11, 8, 5, 2,
258
 2, 5, 8,12,15,19,22,25,29,32,35,39,42,46,49,52,52,49,46,42,39,35,32,29,25,22,19,15,12, 8, 5, 2,
259
 2, 5, 9,13,16,20,24,27,31,34,38,42,45,49,53,56,56,53,49,45,42,38,34,31,27,24,20,16,13, 9, 5, 2,
260
 2, 6,10,14,17,21,25,29,33,37,41,45,48,52,56,60,60,56,52,48,45,41,37,33,29,25,21,17,14,10, 6, 2,
261
 2, 6,10,14,17,21,25,29,33,37,41,45,48,52,56,60,60,56,52,48,45,41,37,33,29,25,21,17,14,10, 6, 2,
262
 2, 5, 9,13,16,20,24,27,31,34,38,42,45,49,53,56,56,53,49,45,42,38,34,31,27,24,20,16,13, 9, 5, 2,
263
 2, 5, 8,12,15,19,22,25,29,32,35,39,42,46,49,52,52,49,46,42,39,35,32,29,25,22,19,15,12, 8, 5, 2,
264
 2, 5, 8,11,14,17,20,23,27,30,33,36,39,42,45,48,48,45,42,39,36,33,30,27,23,20,17,14,11, 8, 5, 2,
265
 1, 4, 7,10,13,16,19,22,24,27,30,33,36,39,42,45,45,42,39,36,33,30,27,24,22,19,16,13,10, 7, 4, 1,
266
 1, 4, 7, 9,12,14,17,20,22,25,28,30,33,35,38,41,41,38,35,33,30,28,25,22,20,17,14,12, 9, 7, 4, 1,
267
 1, 4, 6, 8,11,13,15,18,20,23,25,27,30,32,34,37,37,34,32,30,27,25,23,20,18,15,13,11, 8, 6, 4, 1,
268
 1, 3, 5, 7,10,12,14,16,18,20,22,24,27,29,31,33,33,31,29,27,24,22,20,18,16,14,12,10, 7, 5, 3, 1,
269
 1, 3, 5, 7, 8,10,12,14,16,18,20,22,23,25,27,29,29,27,25,23,22,20,18,16,14,12,10, 8, 7, 5, 3, 1,
270
 1, 2, 4, 6, 7, 9,11,12,14,15,17,19,20,22,24,25,25,24,22,20,19,17,15,14,12,11, 9, 7, 6, 4, 2, 1,
271
 1, 2, 3, 5, 6, 8, 9,10,12,13,14,16,17,19,20,21,21,20,19,17,16,14,13,12,10, 9, 8, 6, 5, 3, 2, 1,
272
 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,14,15,16,17,17,16,15,14,13,12,11,10, 8, 7, 6, 5, 4, 3, 2, 1,
273
 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,13,12,11,10, 9, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0,
274
 0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
275
 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
276
 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
277
 //error:0.000020
278
};
279
static const uint8_t obmc16[256]={
280
 0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
281
 1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
282
 1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
283
 2, 5, 9,12,16,19,23,26,26,23,19,16,12, 9, 5, 2,
284
 2, 7,11,16,20,25,29,34,34,29,25,20,16,11, 7, 2,
285
 3, 8,14,19,25,30,36,41,41,36,30,25,19,14, 8, 3,
286
 3,10,16,23,29,36,42,49,49,42,36,29,23,16,10, 3,
287
 4,11,19,26,34,41,49,56,56,49,41,34,26,19,11, 4,
288
 4,11,19,26,34,41,49,56,56,49,41,34,26,19,11, 4,
289
 3,10,16,23,29,36,42,49,49,42,36,29,23,16,10, 3,
290
 3, 8,14,19,25,30,36,41,41,36,30,25,19,14, 8, 3,
291
 2, 7,11,16,20,25,29,34,34,29,25,20,16,11, 7, 2,
292
 2, 5, 9,12,16,19,23,26,26,23,19,16,12, 9, 5, 2,
293
 1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
294
 1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
295
 0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
296
//error:0.000015
297
};
298
#else //64*cos
299
static const uint8_t obmc32[1024]={
300
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
301
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
302
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
303
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
304
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
305
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
306
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
307
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
308
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
309
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
310
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
311
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
312
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
313
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
314
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
315
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
316
 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
317
 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
318
 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
319
 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
320
 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
321
 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
322
 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
323
 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
324
 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
325
 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
326
 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
327
 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
328
 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
329
 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
330
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
331
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
332
//error:0.000022
333
};
334
static const uint8_t obmc16[256]={
335
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
336
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
337
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
338
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
339
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
340
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
341
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
342
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
343
 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
344
 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
345
 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
346
 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
347
 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
348
 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
349
 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
350
 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
351
//error:0.000022
352
};
353
#endif
354

    
355
//linear *64
356
static const uint8_t obmc8[64]={
357
 1, 3, 5, 7, 7, 5, 3, 1,
358
 3, 9,15,21,21,15, 9, 3,
359
 5,15,25,35,35,25,15, 5,
360
 7,21,35,49,49,35,21, 7,
361
 7,21,35,49,49,35,21, 7,
362
 5,15,25,35,35,25,15, 5,
363
 3, 9,15,21,21,15, 9, 3,
364
 1, 3, 5, 7, 7, 5, 3, 1,
365
//error:0.000000
366
};
367

    
368
//linear *64
369
static const uint8_t obmc4[16]={
370
 4,12,12, 4,
371
12,36,36,12,
372
12,36,36,12,
373
 4,12,12, 4,
374
//error:0.000000
375
};
376

    
377
static const uint8_t *obmc_tab[4]={
378
    obmc32, obmc16, obmc8, obmc4
379
};
380

    
381
typedef struct BlockNode{
382
    int16_t mx;
383
    int16_t my;
384
    uint8_t color[3];
385
    uint8_t type;
386
//#define TYPE_SPLIT    1
387
#define BLOCK_INTRA   1
388
#define BLOCK_OPT     2
389
//#define TYPE_NOCOLOR  4
390
    uint8_t level; //FIXME merge into type?
391
}BlockNode;
392

    
393
static const BlockNode null_block= { //FIXME add border maybe
394
    .color= {128,128,128},
395
    .mx= 0,
396
    .my= 0,
397
    .type= 0,
398
    .level= 0,
399
};
400

    
401
#define LOG2_MB_SIZE 4
402
#define MB_SIZE (1<<LOG2_MB_SIZE)
403

    
404
typedef struct x_and_coeff{
405
    int16_t x;
406
    uint16_t coeff;
407
} x_and_coeff;
408

    
409
typedef struct SubBand{
410
    int level;
411
    int stride;
412
    int width;
413
    int height;
414
    int qlog;                                   ///< log(qscale)/log[2^(1/6)]
415
    DWTELEM *buf;
416
    int buf_x_offset;
417
    int buf_y_offset;
418
    int stride_line; ///< Stride measured in lines, not pixels.
419
    x_and_coeff * x_coeff;
420
    struct SubBand *parent;
421
    uint8_t state[/*7*2*/ 7 + 512][32];
422
}SubBand;
423

    
424
typedef struct Plane{
425
    int width;
426
    int height;
427
    SubBand band[MAX_DECOMPOSITIONS][4];
428
}Plane;
429

    
430
/** Used to minimize the amount of memory used in order to optimize cache performance. **/
431
typedef struct {
432
    DWTELEM * * line; ///< For use by idwt and predict_slices.
433
    DWTELEM * * data_stack; ///< Used for internal purposes.
434
    int data_stack_top;
435
    int line_count;
436
    int line_width;
437
    int data_count;
438
    DWTELEM * base_buffer; ///< Buffer that this structure is caching.
439
} slice_buffer;
440

    
441
typedef struct SnowContext{
442
//    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
443

    
444
    AVCodecContext *avctx;
445
    RangeCoder c;
446
    DSPContext dsp;
447
    AVFrame new_picture;
448
    AVFrame input_picture;              ///< new_picture with the internal linesizes
449
    AVFrame current_picture;
450
    AVFrame last_picture;
451
    AVFrame mconly_picture;
452
//     uint8_t q_context[16];
453
    uint8_t header_state[32];
454
    uint8_t block_state[128 + 32*128];
455
    int keyframe;
456
    int always_reset;
457
    int version;
458
    int spatial_decomposition_type;
459
    int temporal_decomposition_type;
460
    int spatial_decomposition_count;
461
    int temporal_decomposition_count;
462
    DWTELEM *spatial_dwt_buffer;
463
    int colorspace_type;
464
    int chroma_h_shift;
465
    int chroma_v_shift;
466
    int spatial_scalability;
467
    int qlog;
468
    int lambda;
469
    int lambda2;
470
    int mv_scale;
471
    int qbias;
472
#define QBIAS_SHIFT 3
473
    int b_width;
474
    int b_height;
475
    int block_max_depth;
476
    Plane plane[MAX_PLANES];
477
    BlockNode *block;
478
#define ME_CACHE_SIZE 1024
479
    int me_cache[ME_CACHE_SIZE];
480
    int me_cache_generation;
481
    slice_buffer sb;
482

    
483
    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
484
}SnowContext;
485

    
486
typedef struct {
487
    DWTELEM *b0;
488
    DWTELEM *b1;
489
    DWTELEM *b2;
490
    DWTELEM *b3;
491
    int y;
492
} dwt_compose_t;
493

    
494
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
495
//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
496

    
497
static void iterative_me(SnowContext *s);
498

    
499
static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
500
{
501
    int i;
502

    
503
    buf->base_buffer = base_buffer;
504
    buf->line_count = line_count;
505
    buf->line_width = line_width;
506
    buf->data_count = max_allocated_lines;
507
    buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
508
    buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
509

    
510
    for (i = 0; i < max_allocated_lines; i++)
511
    {
512
      buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
513
    }
514

    
515
    buf->data_stack_top = max_allocated_lines - 1;
516
}
517

    
518
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
519
{
520
    int offset;
521
    DWTELEM * buffer;
522

    
523
//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
524

    
525
    assert(buf->data_stack_top >= 0);
526
//  assert(!buf->line[line]);
527
    if (buf->line[line])
528
        return buf->line[line];
529

    
530
    offset = buf->line_width * line;
531
    buffer = buf->data_stack[buf->data_stack_top];
532
    buf->data_stack_top--;
533
    buf->line[line] = buffer;
534

    
535
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
536

    
537
    return buffer;
538
}
539

    
540
static void slice_buffer_release(slice_buffer * buf, int line)
541
{
542
    int offset;
543
    DWTELEM * buffer;
544

    
545
    assert(line >= 0 && line < buf->line_count);
546
    assert(buf->line[line]);
547

    
548
    offset = buf->line_width * line;
549
    buffer = buf->line[line];
550
    buf->data_stack_top++;
551
    buf->data_stack[buf->data_stack_top] = buffer;
552
    buf->line[line] = NULL;
553

    
554
//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
555
}
556

    
557
static void slice_buffer_flush(slice_buffer * buf)
558
{
559
    int i;
560
    for (i = 0; i < buf->line_count; i++)
561
    {
562
        if (buf->line[i])
563
        {
564
//      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
565
            slice_buffer_release(buf, i);
566
        }
567
    }
568
}
569

    
570
static void slice_buffer_destroy(slice_buffer * buf)
571
{
572
    int i;
573
    slice_buffer_flush(buf);
574

    
575
    for (i = buf->data_count - 1; i >= 0; i--)
576
    {
577
        assert(buf->data_stack[i]);
578
        av_free(buf->data_stack[i]);
579
    }
580
    assert(buf->data_stack);
581
    av_free(buf->data_stack);
582
    assert(buf->line);
583
    av_free(buf->line);
584
}
585

    
586
#ifdef __sgi
587
// Avoid a name clash on SGI IRIX
588
#undef qexp
589
#endif
590
#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
591
static uint8_t qexp[QROOT];
592

    
593
static inline int mirror(int v, int m){
594
    while((unsigned)v > (unsigned)m){
595
        v=-v;
596
        if(v<0) v+= 2*m;
597
    }
598
    return v;
599
}
600

    
601
static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
602
    int i;
603

    
604
    if(v){
605
        const int a= ABS(v);
606
        const int e= av_log2(a);
607
#if 1
608
        const int el= FFMIN(e, 10);
609
        put_rac(c, state+0, 0);
610

    
611
        for(i=0; i<el; i++){
612
            put_rac(c, state+1+i, 1);  //1..10
613
        }
614
        for(; i<e; i++){
615
            put_rac(c, state+1+9, 1);  //1..10
616
        }
617
        put_rac(c, state+1+FFMIN(i,9), 0);
618

    
619
        for(i=e-1; i>=el; i--){
620
            put_rac(c, state+22+9, (a>>i)&1); //22..31
621
        }
622
        for(; i>=0; i--){
623
            put_rac(c, state+22+i, (a>>i)&1); //22..31
624
        }
625

    
626
        if(is_signed)
627
            put_rac(c, state+11 + el, v < 0); //11..21
628
#else
629

    
630
        put_rac(c, state+0, 0);
631
        if(e<=9){
632
            for(i=0; i<e; i++){
633
                put_rac(c, state+1+i, 1);  //1..10
634
            }
635
            put_rac(c, state+1+i, 0);
636

    
637
            for(i=e-1; i>=0; i--){
638
                put_rac(c, state+22+i, (a>>i)&1); //22..31
639
            }
640

    
641
            if(is_signed)
642
                put_rac(c, state+11 + e, v < 0); //11..21
643
        }else{
644
            for(i=0; i<e; i++){
645
                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
646
            }
647
            put_rac(c, state+1+FFMIN(i,9), 0);
648

    
649
            for(i=e-1; i>=0; i--){
650
                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
651
            }
652

    
653
            if(is_signed)
654
                put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
655
        }
656
#endif
657
    }else{
658
        put_rac(c, state+0, 1);
659
    }
660
}
661

    
662
static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
663
    if(get_rac(c, state+0))
664
        return 0;
665
    else{
666
        int i, e, a;
667
        e= 0;
668
        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
669
            e++;
670
        }
671

    
672
        a= 1;
673
        for(i=e-1; i>=0; i--){
674
            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
675
        }
676

    
677
        if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
678
            return -a;
679
        else
680
            return a;
681
    }
682
}
683

    
684
static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
685
    int i;
686
    int r= log2>=0 ? 1<<log2 : 1;
687

    
688
    assert(v>=0);
689
    assert(log2>=-4);
690

    
691
    while(v >= r){
692
        put_rac(c, state+4+log2, 1);
693
        v -= r;
694
        log2++;
695
        if(log2>0) r+=r;
696
    }
697
    put_rac(c, state+4+log2, 0);
698

    
699
    for(i=log2-1; i>=0; i--){
700
        put_rac(c, state+31-i, (v>>i)&1);
701
    }
702
}
703

    
704
static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
705
    int i;
706
    int r= log2>=0 ? 1<<log2 : 1;
707
    int v=0;
708

    
709
    assert(log2>=-4);
710

    
711
    while(get_rac(c, state+4+log2)){
712
        v+= r;
713
        log2++;
714
        if(log2>0) r+=r;
715
    }
716

    
717
    for(i=log2-1; i>=0; i--){
718
        v+= get_rac(c, state+31-i)<<i;
719
    }
720

    
721
    return v;
722
}
723

    
724
static always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
725
    const int mirror_left= !highpass;
726
    const int mirror_right= (width&1) ^ highpass;
727
    const int w= (width>>1) - 1 + (highpass & width);
728
    int i;
729

    
730
#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
731
    if(mirror_left){
732
        dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
733
        dst += dst_step;
734
        src += src_step;
735
    }
736

    
737
    for(i=0; i<w; i++){
738
        dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
739
    }
740

    
741
    if(mirror_right){
742
        dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
743
    }
744
}
745

    
746
static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
747
    const int mirror_left= !highpass;
748
    const int mirror_right= (width&1) ^ highpass;
749
    const int w= (width>>1) - 1 + (highpass & width);
750
    int i;
751

    
752
    if(mirror_left){
753
        int r= 3*2*ref[0];
754
        r += r>>4;
755
        r += r>>8;
756
        dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
757
        dst += dst_step;
758
        src += src_step;
759
    }
760

    
761
    for(i=0; i<w; i++){
762
        int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
763
        r += r>>4;
764
        r += r>>8;
765
        dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
766
    }
767

    
768
    if(mirror_right){
769
        int r= 3*2*ref[w*ref_step];
770
        r += r>>4;
771
        r += r>>8;
772
        dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
773
    }
774
}
775

    
776
static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
777
    const int mirror_left= !highpass;
778
    const int mirror_right= (width&1) ^ highpass;
779
    const int w= (width>>1) - 1 + (highpass & width);
780
    int i;
781

    
782
    assert(shift == 4);
783
#define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
784
    if(mirror_left){
785
        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
786
        dst += dst_step;
787
        src += src_step;
788
    }
789

    
790
    for(i=0; i<w; i++){
791
        dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
792
    }
793

    
794
    if(mirror_right){
795
        dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
796
    }
797
}
798

    
799

    
800
static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
801
    int x, i;
802

    
803
    for(x=start; x<width; x+=2){
804
        int64_t sum=0;
805

    
806
        for(i=0; i<n; i++){
807
            int x2= x + 2*i - n + 1;
808
            if     (x2<     0) x2= -x2;
809
            else if(x2>=width) x2= 2*width-x2-2;
810
            sum += coeffs[i]*(int64_t)dst[x2];
811
        }
812
        if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
813
        else        dst[x] += (sum + (1<<shift)/2)>>shift;
814
    }
815
}
816

    
817
static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
818
    int x, y, i;
819
    for(y=start; y<height; y+=2){
820
        for(x=0; x<width; x++){
821
            int64_t sum=0;
822

    
823
            for(i=0; i<n; i++){
824
                int y2= y + 2*i - n + 1;
825
                if     (y2<      0) y2= -y2;
826
                else if(y2>=height) y2= 2*height-y2-2;
827
                sum += coeffs[i]*(int64_t)dst[x + y2*stride];
828
            }
829
            if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
830
            else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
831
        }
832
    }
833
}
834

    
835
#define SCALEX 1
836
#define LX0 0
837
#define LX1 1
838

    
839
#if 0 // more accurate 9/7
840
#define N1 2
841
#define SHIFT1 14
842
#define COEFFS1 (int[]){-25987,-25987}
843
#define N2 2
844
#define SHIFT2 19
845
#define COEFFS2 (int[]){-27777,-27777}
846
#define N3 2
847
#define SHIFT3 15
848
#define COEFFS3 (int[]){28931,28931}
849
#define N4 2
850
#define SHIFT4 15
851
#define COEFFS4 (int[]){14533,14533}
852
#elif 1 // 13/7 CRF
853
#define N1 4
854
#define SHIFT1 4
855
#define COEFFS1 (int[]){1,-9,-9,1}
856
#define N2 4
857
#define SHIFT2 4
858
#define COEFFS2 (int[]){-1,5,5,-1}
859
#define N3 0
860
#define SHIFT3 1
861
#define COEFFS3 NULL
862
#define N4 0
863
#define SHIFT4 1
864
#define COEFFS4 NULL
865
#elif 1 // 3/5
866
#define LX0 1
867
#define LX1 0
868
#define SCALEX 0.5
869
#define N1 2
870
#define SHIFT1 1
871
#define COEFFS1 (int[]){1,1}
872
#define N2 2
873
#define SHIFT2 2
874
#define COEFFS2 (int[]){-1,-1}
875
#define N3 0
876
#define SHIFT3 0
877
#define COEFFS3 NULL
878
#define N4 0
879
#define SHIFT4 0
880
#define COEFFS4 NULL
881
#elif 1 // 11/5
882
#define N1 0
883
#define SHIFT1 1
884
#define COEFFS1 NULL
885
#define N2 2
886
#define SHIFT2 2
887
#define COEFFS2 (int[]){-1,-1}
888
#define N3 2
889
#define SHIFT3 0
890
#define COEFFS3 (int[]){-1,-1}
891
#define N4 4
892
#define SHIFT4 7
893
#define COEFFS4 (int[]){-5,29,29,-5}
894
#define SCALEX 4
895
#elif 1 // 9/7 CDF
896
#define N1 2
897
#define SHIFT1 7
898
#define COEFFS1 (int[]){-203,-203}
899
#define N2 2
900
#define SHIFT2 12
901
#define COEFFS2 (int[]){-217,-217}
902
#define N3 2
903
#define SHIFT3 7
904
#define COEFFS3 (int[]){113,113}
905
#define N4 2
906
#define SHIFT4 9
907
#define COEFFS4 (int[]){227,227}
908
#define SCALEX 1
909
#elif 1 // 7/5 CDF
910
#define N1 0
911
#define SHIFT1 1
912
#define COEFFS1 NULL
913
#define N2 2
914
#define SHIFT2 2
915
#define COEFFS2 (int[]){-1,-1}
916
#define N3 2
917
#define SHIFT3 0
918
#define COEFFS3 (int[]){-1,-1}
919
#define N4 2
920
#define SHIFT4 4
921
#define COEFFS4 (int[]){3,3}
922
#elif 1 // 9/7 MN
923
#define N1 4
924
#define SHIFT1 4
925
#define COEFFS1 (int[]){1,-9,-9,1}
926
#define N2 2
927
#define SHIFT2 2
928
#define COEFFS2 (int[]){1,1}
929
#define N3 0
930
#define SHIFT3 1
931
#define COEFFS3 NULL
932
#define N4 0
933
#define SHIFT4 1
934
#define COEFFS4 NULL
935
#else // 13/7 CRF
936
#define N1 4
937
#define SHIFT1 4
938
#define COEFFS1 (int[]){1,-9,-9,1}
939
#define N2 4
940
#define SHIFT2 4
941
#define COEFFS2 (int[]){-1,5,5,-1}
942
#define N3 0
943
#define SHIFT3 1
944
#define COEFFS3 NULL
945
#define N4 0
946
#define SHIFT4 1
947
#define COEFFS4 NULL
948
#endif
949
static void horizontal_decomposeX(DWTELEM *b, int width){
950
    DWTELEM temp[width];
951
    const int width2= width>>1;
952
    const int w2= (width+1)>>1;
953
    int x;
954

    
955
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
956
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
957
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
958
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
959

    
960
    for(x=0; x<width2; x++){
961
        temp[x   ]= b[2*x    ];
962
        temp[x+w2]= b[2*x + 1];
963
    }
964
    if(width&1)
965
        temp[x   ]= b[2*x    ];
966
    memcpy(b, temp, width*sizeof(int));
967
}
968

    
969
static void horizontal_composeX(DWTELEM *b, int width){
970
    DWTELEM temp[width];
971
    const int width2= width>>1;
972
    int x;
973
    const int w2= (width+1)>>1;
974

    
975
    memcpy(temp, b, width*sizeof(int));
976
    for(x=0; x<width2; x++){
977
        b[2*x    ]= temp[x   ];
978
        b[2*x + 1]= temp[x+w2];
979
    }
980
    if(width&1)
981
        b[2*x    ]= temp[x   ];
982

    
983
    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
984
    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
985
    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
986
    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
987
}
988

    
989
static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
990
    int x, y;
991

    
992
    for(y=0; y<height; y++){
993
        for(x=0; x<width; x++){
994
            buffer[y*stride + x] *= SCALEX;
995
        }
996
    }
997

    
998
    for(y=0; y<height; y++){
999
        horizontal_decomposeX(buffer + y*stride, width);
1000
    }
1001

    
1002
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
1003
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
1004
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
1005
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
1006
}
1007

    
1008
static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
1009
    int x, y;
1010

    
1011
    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1012
    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1013
    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1014
    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1015

    
1016
    for(y=0; y<height; y++){
1017
        horizontal_composeX(buffer + y*stride, width);
1018
    }
1019

    
1020
    for(y=0; y<height; y++){
1021
        for(x=0; x<width; x++){
1022
            buffer[y*stride + x] /= SCALEX;
1023
        }
1024
    }
1025
}
1026

    
1027
static void horizontal_decompose53i(DWTELEM *b, int width){
1028
    DWTELEM temp[width];
1029
    const int width2= width>>1;
1030
    int x;
1031
    const int w2= (width+1)>>1;
1032

    
1033
    for(x=0; x<width2; x++){
1034
        temp[x   ]= b[2*x    ];
1035
        temp[x+w2]= b[2*x + 1];
1036
    }
1037
    if(width&1)
1038
        temp[x   ]= b[2*x    ];
1039
#if 0
1040
    {
1041
    int A1,A2,A3,A4;
1042
    A2= temp[1       ];
1043
    A4= temp[0       ];
1044
    A1= temp[0+width2];
1045
    A1 -= (A2 + A4)>>1;
1046
    A4 += (A1 + 1)>>1;
1047
    b[0+width2] = A1;
1048
    b[0       ] = A4;
1049
    for(x=1; x+1<width2; x+=2){
1050
        A3= temp[x+width2];
1051
        A4= temp[x+1     ];
1052
        A3 -= (A2 + A4)>>1;
1053
        A2 += (A1 + A3 + 2)>>2;
1054
        b[x+width2] = A3;
1055
        b[x       ] = A2;
1056

1057
        A1= temp[x+1+width2];
1058
        A2= temp[x+2       ];
1059
        A1 -= (A2 + A4)>>1;
1060
        A4 += (A1 + A3 + 2)>>2;
1061
        b[x+1+width2] = A1;
1062
        b[x+1       ] = A4;
1063
    }
1064
    A3= temp[width-1];
1065
    A3 -= A2;
1066
    A2 += (A1 + A3 + 2)>>2;
1067
    b[width -1] = A3;
1068
    b[width2-1] = A2;
1069
    }
1070
#else
1071
    lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1072
    lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1073
#endif
1074
}
1075

    
1076
static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1077
    int i;
1078

    
1079
    for(i=0; i<width; i++){
1080
        b1[i] -= (b0[i] + b2[i])>>1;
1081
    }
1082
}
1083

    
1084
static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1085
    int i;
1086

    
1087
    for(i=0; i<width; i++){
1088
        b1[i] += (b0[i] + b2[i] + 2)>>2;
1089
    }
1090
}
1091

    
1092
static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1093
    int y;
1094
    DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1095
    DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1096

    
1097
    for(y=-2; y<height; y+=2){
1098
        DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1099
        DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1100

    
1101
{START_TIMER
1102
        if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1103
        if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1104
STOP_TIMER("horizontal_decompose53i")}
1105

    
1106
{START_TIMER
1107
        if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1108
        if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1109
STOP_TIMER("vertical_decompose53i*")}
1110

    
1111
        b0=b2;
1112
        b1=b3;
1113
    }
1114
}
1115

    
1116
#define liftS lift
1117
#define lift5 lift
1118
#if 1
1119
#define W_AM 3
1120
#define W_AO 0
1121
#define W_AS 1
1122

    
1123
#undef liftS
1124
#define W_BM 1
1125
#define W_BO 8
1126
#define W_BS 4
1127

    
1128
#define W_CM 1
1129
#define W_CO 0
1130
#define W_CS 0
1131

    
1132
#define W_DM 3
1133
#define W_DO 4
1134
#define W_DS 3
1135
#elif 0
1136
#define W_AM 55
1137
#define W_AO 16
1138
#define W_AS 5
1139

    
1140
#define W_BM 3
1141
#define W_BO 32
1142
#define W_BS 6
1143

    
1144
#define W_CM 127
1145
#define W_CO 64
1146
#define W_CS 7
1147

    
1148
#define W_DM 7
1149
#define W_DO 8
1150
#define W_DS 4
1151
#elif 0
1152
#define W_AM 97
1153
#define W_AO 32
1154
#define W_AS 6
1155

    
1156
#define W_BM 63
1157
#define W_BO 512
1158
#define W_BS 10
1159

    
1160
#define W_CM 13
1161
#define W_CO 8
1162
#define W_CS 4
1163

    
1164
#define W_DM 15
1165
#define W_DO 16
1166
#define W_DS 5
1167

    
1168
#else
1169

    
1170
#define W_AM 203
1171
#define W_AO 64
1172
#define W_AS 7
1173

    
1174
#define W_BM 217
1175
#define W_BO 2048
1176
#define W_BS 12
1177

    
1178
#define W_CM 113
1179
#define W_CO 64
1180
#define W_CS 7
1181

    
1182
#define W_DM 227
1183
#define W_DO 128
1184
#define W_DS 9
1185
#endif
1186
static void horizontal_decompose97i(DWTELEM *b, int width){
1187
    DWTELEM temp[width];
1188
    const int w2= (width+1)>>1;
1189

    
1190
    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1191
    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1192
    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1193
    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1194
}
1195

    
1196

    
1197
static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1198
    int i;
1199

    
1200
    for(i=0; i<width; i++){
1201
        b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1202
    }
1203
}
1204

    
1205
static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1206
    int i;
1207

    
1208
    for(i=0; i<width; i++){
1209
#ifdef lift5
1210
        b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1211
#else
1212
        int r= 3*(b0[i] + b2[i]);
1213
        r+= r>>4;
1214
        r+= r>>8;
1215
        b1[i] += (r+W_CO)>>W_CS;
1216
#endif
1217
    }
1218
}
1219

    
1220
static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1221
    int i;
1222

    
1223
    for(i=0; i<width; i++){
1224
#ifdef liftS
1225
        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1226
#else
1227
        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1228
#endif
1229
    }
1230
}
1231

    
1232
static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1233
    int i;
1234

    
1235
    for(i=0; i<width; i++){
1236
        b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1237
    }
1238
}
1239

    
1240
static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1241
    int y;
1242
    DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1243
    DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1244
    DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1245
    DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1246

    
1247
    for(y=-4; y<height; y+=2){
1248
        DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1249
        DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1250

    
1251
{START_TIMER
1252
        if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1253
        if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1254
if(width>400){
1255
STOP_TIMER("horizontal_decompose97i")
1256
}}
1257

    
1258
{START_TIMER
1259
        if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1260
        if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1261
        if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1262
        if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1263

    
1264
if(width>400){
1265
STOP_TIMER("vertical_decompose97i")
1266
}}
1267

    
1268
        b0=b2;
1269
        b1=b3;
1270
        b2=b4;
1271
        b3=b5;
1272
    }
1273
}
1274

    
1275
void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1276
    int level;
1277

    
1278
    for(level=0; level<decomposition_count; level++){
1279
        switch(type){
1280
        case 0: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1281
        case 1: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1282
        case 2: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1283
        }
1284
    }
1285
}
1286

    
1287
static void horizontal_compose53i(DWTELEM *b, int width){
1288
    DWTELEM temp[width];
1289
    const int width2= width>>1;
1290
    const int w2= (width+1)>>1;
1291
    int x;
1292

    
1293
#if 0
1294
    int A1,A2,A3,A4;
1295
    A2= temp[1       ];
1296
    A4= temp[0       ];
1297
    A1= temp[0+width2];
1298
    A1 -= (A2 + A4)>>1;
1299
    A4 += (A1 + 1)>>1;
1300
    b[0+width2] = A1;
1301
    b[0       ] = A4;
1302
    for(x=1; x+1<width2; x+=2){
1303
        A3= temp[x+width2];
1304
        A4= temp[x+1     ];
1305
        A3 -= (A2 + A4)>>1;
1306
        A2 += (A1 + A3 + 2)>>2;
1307
        b[x+width2] = A3;
1308
        b[x       ] = A2;
1309

1310
        A1= temp[x+1+width2];
1311
        A2= temp[x+2       ];
1312
        A1 -= (A2 + A4)>>1;
1313
        A4 += (A1 + A3 + 2)>>2;
1314
        b[x+1+width2] = A1;
1315
        b[x+1       ] = A4;
1316
    }
1317
    A3= temp[width-1];
1318
    A3 -= A2;
1319
    A2 += (A1 + A3 + 2)>>2;
1320
    b[width -1] = A3;
1321
    b[width2-1] = A2;
1322
#else
1323
    lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1324
    lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1325
#endif
1326
    for(x=0; x<width2; x++){
1327
        b[2*x    ]= temp[x   ];
1328
        b[2*x + 1]= temp[x+w2];
1329
    }
1330
    if(width&1)
1331
        b[2*x    ]= temp[x   ];
1332
}
1333

    
1334
static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1335
    int i;
1336

    
1337
    for(i=0; i<width; i++){
1338
        b1[i] += (b0[i] + b2[i])>>1;
1339
    }
1340
}
1341

    
1342
static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1343
    int i;
1344

    
1345
    for(i=0; i<width; i++){
1346
        b1[i] -= (b0[i] + b2[i] + 2)>>2;
1347
    }
1348
}
1349

    
1350
static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1351
    cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1352
    cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1353
    cs->y = -1;
1354
}
1355

    
1356
static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1357
    cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1358
    cs->b1 = buffer + mirror(-1  , height-1)*stride;
1359
    cs->y = -1;
1360
}
1361

    
1362
static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1363
    int y= cs->y;
1364

    
1365
    DWTELEM *b0= cs->b0;
1366
    DWTELEM *b1= cs->b1;
1367
    DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1368
    DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1369

    
1370
{START_TIMER
1371
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1372
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1373
STOP_TIMER("vertical_compose53i*")}
1374

    
1375
{START_TIMER
1376
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1377
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1378
STOP_TIMER("horizontal_compose53i")}
1379

    
1380
    cs->b0 = b2;
1381
    cs->b1 = b3;
1382
    cs->y += 2;
1383
}
1384

    
1385
static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1386
    int y= cs->y;
1387
    DWTELEM *b0= cs->b0;
1388
    DWTELEM *b1= cs->b1;
1389
    DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1390
    DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1391

    
1392
{START_TIMER
1393
        if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1394
        if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1395
STOP_TIMER("vertical_compose53i*")}
1396

    
1397
{START_TIMER
1398
        if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1399
        if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1400
STOP_TIMER("horizontal_compose53i")}
1401

    
1402
    cs->b0 = b2;
1403
    cs->b1 = b3;
1404
    cs->y += 2;
1405
}
1406

    
1407
static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1408
    dwt_compose_t cs;
1409
    spatial_compose53i_init(&cs, buffer, height, stride);
1410
    while(cs.y <= height)
1411
        spatial_compose53i_dy(&cs, buffer, width, height, stride);
1412
}
1413

    
1414

    
1415
static void horizontal_compose97i(DWTELEM *b, int width){
1416
    DWTELEM temp[width];
1417
    const int w2= (width+1)>>1;
1418

    
1419
    lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1420
    lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1421
    liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1422
    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1423
}
1424

    
1425
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1426
    int i;
1427

    
1428
    for(i=0; i<width; i++){
1429
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1430
    }
1431
}
1432

    
1433
static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1434
    int i;
1435

    
1436
    for(i=0; i<width; i++){
1437
#ifdef lift5
1438
        b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1439
#else
1440
        int r= 3*(b0[i] + b2[i]);
1441
        r+= r>>4;
1442
        r+= r>>8;
1443
        b1[i] -= (r+W_CO)>>W_CS;
1444
#endif
1445
    }
1446
}
1447

    
1448
static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1449
    int i;
1450

    
1451
    for(i=0; i<width; i++){
1452
#ifdef liftS
1453
        b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1454
#else
1455
        b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1456
#endif
1457
    }
1458
}
1459

    
1460
static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1461
    int i;
1462

    
1463
    for(i=0; i<width; i++){
1464
        b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1465
    }
1466
}
1467

    
1468
static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1469
    int i;
1470

    
1471
    for(i=0; i<width; i++){
1472
#ifndef lift5
1473
        int r;
1474
#endif
1475
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1476
#ifdef lift5
1477
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1478
#else
1479
        r= 3*(b2[i] + b4[i]);
1480
        r+= r>>4;
1481
        r+= r>>8;
1482
        b3[i] -= (r+W_CO)>>W_CS;
1483
#endif
1484
#ifdef liftS
1485
        b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1486
#else
1487
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1488
#endif
1489
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1490
    }
1491
}
1492

    
1493
static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1494
    cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1495
    cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1496
    cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1497
    cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1498
    cs->y = -3;
1499
}
1500

    
1501
static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1502
    cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1503
    cs->b1 = buffer + mirror(-3  , height-1)*stride;
1504
    cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1505
    cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1506
    cs->y = -3;
1507
}
1508

    
1509
static void spatial_compose97i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1510
    int y = cs->y;
1511

    
1512
    DWTELEM *b0= cs->b0;
1513
    DWTELEM *b1= cs->b1;
1514
    DWTELEM *b2= cs->b2;
1515
    DWTELEM *b3= cs->b3;
1516
    DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1517
    DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1518

    
1519
{START_TIMER
1520
    if(y>0 && y+4<height){
1521
        vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1522
    }else{
1523
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1524
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1525
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1526
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1527
    }
1528
if(width>400){
1529
STOP_TIMER("vertical_compose97i")}}
1530

    
1531
{START_TIMER
1532
        if(y-1<(unsigned)height) horizontal_compose97i(b0, width);
1533
        if(y+0<(unsigned)height) horizontal_compose97i(b1, width);
1534
if(width>400 && y+0<(unsigned)height){
1535
STOP_TIMER("horizontal_compose97i")}}
1536

    
1537
    cs->b0=b2;
1538
    cs->b1=b3;
1539
    cs->b2=b4;
1540
    cs->b3=b5;
1541
    cs->y += 2;
1542
}
1543

    
1544
static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1545
    int y = cs->y;
1546
    DWTELEM *b0= cs->b0;
1547
    DWTELEM *b1= cs->b1;
1548
    DWTELEM *b2= cs->b2;
1549
    DWTELEM *b3= cs->b3;
1550
    DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1551
    DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1552

    
1553
{START_TIMER
1554
        if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1555
        if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1556
        if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1557
        if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1558
if(width>400){
1559
STOP_TIMER("vertical_compose97i")}}
1560

    
1561
{START_TIMER
1562
        if(y-1<(unsigned)height) horizontal_compose97i(b0, width);
1563
        if(y+0<(unsigned)height) horizontal_compose97i(b1, width);
1564
if(width>400 && b0 <= b2){
1565
STOP_TIMER("horizontal_compose97i")}}
1566

    
1567
    cs->b0=b2;
1568
    cs->b1=b3;
1569
    cs->b2=b4;
1570
    cs->b3=b5;
1571
    cs->y += 2;
1572
}
1573

    
1574
static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1575
    dwt_compose_t cs;
1576
    spatial_compose97i_init(&cs, buffer, height, stride);
1577
    while(cs.y <= height)
1578
        spatial_compose97i_dy(&cs, buffer, width, height, stride);
1579
}
1580

    
1581
void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1582
    int level;
1583
    for(level=decomposition_count-1; level>=0; level--){
1584
        switch(type){
1585
        case 0: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1586
        case 1: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1587
        /* not slicified yet */
1588
        case 2: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1589
          av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1590
        }
1591
    }
1592
}
1593

    
1594
void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1595
    int level;
1596
    for(level=decomposition_count-1; level>=0; level--){
1597
        switch(type){
1598
        case 0: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1599
        case 1: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1600
        /* not slicified yet */
1601
        case 2: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1602
        }
1603
    }
1604
}
1605

    
1606
void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1607
    const int support = type==1 ? 3 : 5;
1608
    int level;
1609
    if(type==2) return;
1610

    
1611
    for(level=decomposition_count-1; level>=0; level--){
1612
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1613
            switch(type){
1614
            case 0: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1615
                    break;
1616
            case 1: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1617
                    break;
1618
            case 2: break;
1619
            }
1620
        }
1621
    }
1622
}
1623

    
1624
void ff_spatial_idwt_buffered_slice(dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1625
    const int support = type==1 ? 3 : 5;
1626
    int level;
1627
    if(type==2) return;
1628

    
1629
    for(level=decomposition_count-1; level>=0; level--){
1630
        while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1631
            switch(type){
1632
            case 0: spatial_compose97i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1633
                    break;
1634
            case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1635
                    break;
1636
            case 2: break;
1637
            }
1638
        }
1639
    }
1640
}
1641

    
1642
void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1643
    if(type==2){
1644
        int level;
1645
        for(level=decomposition_count-1; level>=0; level--)
1646
            spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1647
    }else{
1648
        dwt_compose_t cs[MAX_DECOMPOSITIONS];
1649
        int y;
1650
        ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1651
        for(y=0; y<height; y+=4)
1652
            ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1653
    }
1654
}
1655

    
1656
static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1657
    const int w= b->width;
1658
    const int h= b->height;
1659
    int x, y;
1660

    
1661
    if(1){
1662
        int run=0;
1663
        int runs[w*h];
1664
        int run_index=0;
1665
        int max_index;
1666

    
1667
        for(y=0; y<h; y++){
1668
            for(x=0; x<w; x++){
1669
                int v, p=0;
1670
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1671
                v= src[x + y*stride];
1672

    
1673
                if(y){
1674
                    t= src[x + (y-1)*stride];
1675
                    if(x){
1676
                        lt= src[x - 1 + (y-1)*stride];
1677
                    }
1678
                    if(x + 1 < w){
1679
                        rt= src[x + 1 + (y-1)*stride];
1680
                    }
1681
                }
1682
                if(x){
1683
                    l= src[x - 1 + y*stride];
1684
                    /*if(x > 1){
1685
                        if(orientation==1) ll= src[y + (x-2)*stride];
1686
                        else               ll= src[x - 2 + y*stride];
1687
                    }*/
1688
                }
1689
                if(parent){
1690
                    int px= x>>1;
1691
                    int py= y>>1;
1692
                    if(px<b->parent->width && py<b->parent->height)
1693
                        p= parent[px + py*2*stride];
1694
                }
1695
                if(!(/*ll|*/l|lt|t|rt|p)){
1696
                    if(v){
1697
                        runs[run_index++]= run;
1698
                        run=0;
1699
                    }else{
1700
                        run++;
1701
                    }
1702
                }
1703
            }
1704
        }
1705
        max_index= run_index;
1706
        runs[run_index++]= run;
1707
        run_index=0;
1708
        run= runs[run_index++];
1709

    
1710
        put_symbol2(&s->c, b->state[30], max_index, 0);
1711
        if(run_index <= max_index)
1712
            put_symbol2(&s->c, b->state[1], run, 3);
1713

    
1714
        for(y=0; y<h; y++){
1715
            if(s->c.bytestream_end - s->c.bytestream < w*40){
1716
                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1717
                return -1;
1718
            }
1719
            for(x=0; x<w; x++){
1720
                int v, p=0;
1721
                int /*ll=0, */l=0, lt=0, t=0, rt=0;
1722
                v= src[x + y*stride];
1723

    
1724
                if(y){
1725
                    t= src[x + (y-1)*stride];
1726
                    if(x){
1727
                        lt= src[x - 1 + (y-1)*stride];
1728
                    }
1729
                    if(x + 1 < w){
1730
                        rt= src[x + 1 + (y-1)*stride];
1731
                    }
1732
                }
1733
                if(x){
1734
                    l= src[x - 1 + y*stride];
1735
                    /*if(x > 1){
1736
                        if(orientation==1) ll= src[y + (x-2)*stride];
1737
                        else               ll= src[x - 2 + y*stride];
1738
                    }*/
1739
                }
1740
                if(parent){
1741
                    int px= x>>1;
1742
                    int py= y>>1;
1743
                    if(px<b->parent->width && py<b->parent->height)
1744
                        p= parent[px + py*2*stride];
1745
                }
1746
                if(/*ll|*/l|lt|t|rt|p){
1747
                    int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
1748

    
1749
                    put_rac(&s->c, &b->state[0][context], !!v);
1750
                }else{
1751
                    if(!run){
1752
                        run= runs[run_index++];
1753

    
1754
                        if(run_index <= max_index)
1755
                            put_symbol2(&s->c, b->state[1], run, 3);
1756
                        assert(v);
1757
                    }else{
1758
                        run--;
1759
                        assert(!v);
1760
                    }
1761
                }
1762
                if(v){
1763
                    int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
1764
                    int l2= 2*ABS(l) + (l<0);
1765
                    int t2= 2*ABS(t) + (t<0);
1766

    
1767
                    put_symbol2(&s->c, b->state[context + 2], ABS(v)-1, context-4);
1768
                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1769
                }
1770
            }
1771
        }
1772
    }
1773
    return 0;
1774
}
1775

    
1776
static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1777
//    encode_subband_qtree(s, b, src, parent, stride, orientation);
1778
//    encode_subband_z0run(s, b, src, parent, stride, orientation);
1779
    return encode_subband_c0run(s, b, src, parent, stride, orientation);
1780
//    encode_subband_dzr(s, b, src, parent, stride, orientation);
1781
}
1782

    
1783
static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1784
    const int w= b->width;
1785
    const int h= b->height;
1786
    int x,y;
1787

    
1788
    if(1){
1789
        int run, runs;
1790
        x_and_coeff *xc= b->x_coeff;
1791
        x_and_coeff *prev_xc= NULL;
1792
        x_and_coeff *prev2_xc= xc;
1793
        x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1794
        x_and_coeff *prev_parent_xc= parent_xc;
1795

    
1796
        runs= get_symbol2(&s->c, b->state[30], 0);
1797
        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1798
        else           run= INT_MAX;
1799

    
1800
        for(y=0; y<h; y++){
1801
            int v=0;
1802
            int lt=0, t=0, rt=0;
1803

    
1804
            if(y && prev_xc->x == 0){
1805
                rt= prev_xc->coeff;
1806
            }
1807
            for(x=0; x<w; x++){
1808
                int p=0;
1809
                const int l= v;
1810

    
1811
                lt= t; t= rt;
1812

    
1813
                if(y){
1814
                    if(prev_xc->x <= x)
1815
                        prev_xc++;
1816
                    if(prev_xc->x == x + 1)
1817
                        rt= prev_xc->coeff;
1818
                    else
1819
                        rt=0;
1820
                }
1821
                if(parent_xc){
1822
                    if(x>>1 > parent_xc->x){
1823
                        parent_xc++;
1824
                    }
1825
                    if(x>>1 == parent_xc->x){
1826
                        p= parent_xc->coeff;
1827
                    }
1828
                }
1829
                if(/*ll|*/l|lt|t|rt|p){
1830
                    int context= av_log2(/*ABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1831

    
1832
                    v=get_rac(&s->c, &b->state[0][context]);
1833
                    if(v){
1834
                        v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1835
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1836

    
1837
                        xc->x=x;
1838
                        (xc++)->coeff= v;
1839
                    }
1840
                }else{
1841
                    if(!run){
1842
                        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1843
                        else           run= INT_MAX;
1844
                        v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1845
                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1846

    
1847
                        xc->x=x;
1848
                        (xc++)->coeff= v;
1849
                    }else{
1850
                        int max_run;
1851
                        run--;
1852
                        v=0;
1853

    
1854
                        if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1855
                        else  max_run= FFMIN(run, w-x-1);
1856
                        if(parent_xc)
1857
                            max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1858
                        x+= max_run;
1859
                        run-= max_run;
1860
                    }
1861
                }
1862
            }
1863
            (xc++)->x= w+1; //end marker
1864
            prev_xc= prev2_xc;
1865
            prev2_xc= xc;
1866

    
1867
            if(parent_xc){
1868
                if(y&1){
1869
                    while(parent_xc->x != parent->width+1)
1870
                        parent_xc++;
1871
                    parent_xc++;
1872
                    prev_parent_xc= parent_xc;
1873
                }else{
1874
                    parent_xc= prev_parent_xc;
1875
                }
1876
            }
1877
        }
1878

    
1879
        (xc++)->x= w+1; //end marker
1880
    }
1881
}
1882

    
1883
static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1884
    const int w= b->width;
1885
    int y;
1886
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
1887
    int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1888
    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1889
    int new_index = 0;
1890

    
1891
    START_TIMER
1892

    
1893
    if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1894
        qadd= 0;
1895
        qmul= 1<<QEXPSHIFT;
1896
    }
1897

    
1898
    /* If we are on the second or later slice, restore our index. */
1899
    if (start_y != 0)
1900
        new_index = save_state[0];
1901

    
1902

    
1903
    for(y=start_y; y<h; y++){
1904
        int x = 0;
1905
        int v;
1906
        DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1907
        memset(line, 0, b->width*sizeof(DWTELEM));
1908
        v = b->x_coeff[new_index].coeff;
1909
        x = b->x_coeff[new_index++].x;
1910
        while(x < w)
1911
        {
1912
            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1913
            register int u= -(v&1);
1914
            line[x] = (t^u) - u;
1915

    
1916
            v = b->x_coeff[new_index].coeff;
1917
            x = b->x_coeff[new_index++].x;
1918
        }
1919
    }
1920
    if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1921
        STOP_TIMER("decode_subband")
1922
    }
1923

    
1924
    /* Save our variables for the next slice. */
1925
    save_state[0] = new_index;
1926

    
1927
    return;
1928
}
1929

    
1930
static void reset_contexts(SnowContext *s){
1931
    int plane_index, level, orientation;
1932

    
1933
    for(plane_index=0; plane_index<3; plane_index++){
1934
        for(level=0; level<s->spatial_decomposition_count; level++){
1935
            for(orientation=level ? 1:0; orientation<4; orientation++){
1936
                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1937
            }
1938
        }
1939
    }
1940
    memset(s->header_state, MID_STATE, sizeof(s->header_state));
1941
    memset(s->block_state, MID_STATE, sizeof(s->block_state));
1942
}
1943

    
1944
static int alloc_blocks(SnowContext *s){
1945
    int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1946
    int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1947

    
1948
    s->b_width = w;
1949
    s->b_height= h;
1950

    
1951
    s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1952
    return 0;
1953
}
1954

    
1955
static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1956
    uint8_t *bytestream= d->bytestream;
1957
    uint8_t *bytestream_start= d->bytestream_start;
1958
    *d= *s;
1959
    d->bytestream= bytestream;
1960
    d->bytestream_start= bytestream_start;
1961
}
1962

    
1963
//near copy & paste from dsputil, FIXME
1964
static int pix_sum(uint8_t * pix, int line_size, int w)
1965
{
1966
    int s, i, j;
1967

    
1968
    s = 0;
1969
    for (i = 0; i < w; i++) {
1970
        for (j = 0; j < w; j++) {
1971
            s += pix[0];
1972
            pix ++;
1973
        }
1974
        pix += line_size - w;
1975
    }
1976
    return s;
1977
}
1978

    
1979
//near copy & paste from dsputil, FIXME
1980
static int pix_norm1(uint8_t * pix, int line_size, int w)
1981
{
1982
    int s, i, j;
1983
    uint32_t *sq = squareTbl + 256;
1984

    
1985
    s = 0;
1986
    for (i = 0; i < w; i++) {
1987
        for (j = 0; j < w; j ++) {
1988
            s += sq[pix[0]];
1989
            pix ++;
1990
        }
1991
        pix += line_size - w;
1992
    }
1993
    return s;
1994
}
1995

    
1996
static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int type){
1997
    const int w= s->b_width << s->block_max_depth;
1998
    const int rem_depth= s->block_max_depth - level;
1999
    const int index= (x + y*w) << rem_depth;
2000
    const int block_w= 1<<rem_depth;
2001
    BlockNode block;
2002
    int i,j;
2003

    
2004
    block.color[0]= l;
2005
    block.color[1]= cb;
2006
    block.color[2]= cr;
2007
    block.mx= mx;
2008
    block.my= my;
2009
    block.type= type;
2010
    block.level= level;
2011

    
2012
    for(j=0; j<block_w; j++){
2013
        for(i=0; i<block_w; i++){
2014
            s->block[index + i + j*w]= block;
2015
        }
2016
    }
2017
}
2018

    
2019
static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
2020
    const int offset[3]= {
2021
          y*c->  stride + x,
2022
        ((y*c->uvstride + x)>>1),
2023
        ((y*c->uvstride + x)>>1),
2024
    };
2025
    int i;
2026
    for(i=0; i<3; i++){
2027
        c->src[0][i]= src [i];
2028
        c->ref[0][i]= ref [i] + offset[i];
2029
    }
2030
    assert(!ref_index);
2031
}
2032

    
2033
//FIXME copy&paste
2034
#define P_LEFT P[1]
2035
#define P_TOP P[2]
2036
#define P_TOPRIGHT P[3]
2037
#define P_MEDIAN P[4]
2038
#define P_MV1 P[9]
2039
#define FLAG_QPEL   1 //must be 1
2040

    
2041
static int encode_q_branch(SnowContext *s, int level, int x, int y){
2042
    uint8_t p_buffer[1024];
2043
    uint8_t i_buffer[1024];
2044
    uint8_t p_state[sizeof(s->block_state)];
2045
    uint8_t i_state[sizeof(s->block_state)];
2046
    RangeCoder pc, ic;
2047
    uint8_t *pbbak= s->c.bytestream;
2048
    uint8_t *pbbak_start= s->c.bytestream_start;
2049
    int score, score2, iscore, i_len, p_len, block_s, sum;
2050
    const int w= s->b_width  << s->block_max_depth;
2051
    const int h= s->b_height << s->block_max_depth;
2052
    const int rem_depth= s->block_max_depth - level;
2053
    const int index= (x + y*w) << rem_depth;
2054
    const int block_w= 1<<(LOG2_MB_SIZE - level);
2055
    int trx= (x+1)<<rem_depth;
2056
    int try= (y+1)<<rem_depth;
2057
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2058
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
2059
    BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
2060
    BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2061
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2062
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2063
    int pl = left->color[0];
2064
    int pcb= left->color[1];
2065
    int pcr= left->color[2];
2066
    int pmx= mid_pred(left->mx, top->mx, tr->mx);
2067
    int pmy= mid_pred(left->my, top->my, tr->my);
2068
    int mx=0, my=0;
2069
    int l,cr,cb;
2070
    const int stride= s->current_picture.linesize[0];
2071
    const int uvstride= s->current_picture.linesize[1];
2072
    uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2073
                                s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2074
                                s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2075
    int P[10][2];
2076
    int16_t last_mv[3][2];
2077
    int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2078
    const int shift= 1+qpel;
2079
    MotionEstContext *c= &s->m.me;
2080
    int mx_context= av_log2(2*ABS(left->mx - top->mx));
2081
    int my_context= av_log2(2*ABS(left->my - top->my));
2082
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2083

    
2084
    assert(sizeof(s->block_state) >= 256);
2085
    if(s->keyframe){
2086
        set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, BLOCK_INTRA);
2087
        return 0;
2088
    }
2089

    
2090
//    clip predictors / edge ?
2091

    
2092
    P_LEFT[0]= left->mx;
2093
    P_LEFT[1]= left->my;
2094
    P_TOP [0]= top->mx;
2095
    P_TOP [1]= top->my;
2096
    P_TOPRIGHT[0]= tr->mx;
2097
    P_TOPRIGHT[1]= tr->my;
2098

    
2099
    last_mv[0][0]= s->block[index].mx;
2100
    last_mv[0][1]= s->block[index].my;
2101
    last_mv[1][0]= right->mx;
2102
    last_mv[1][1]= right->my;
2103
    last_mv[2][0]= bottom->mx;
2104
    last_mv[2][1]= bottom->my;
2105

    
2106
    s->m.mb_stride=2;
2107
    s->m.mb_x=
2108
    s->m.mb_y= 0;
2109
    s->m.me.skip= 0;
2110

    
2111
    init_ref(c, current_data, s->last_picture.data, NULL, block_w*x, block_w*y, 0);
2112

    
2113
    assert(s->m.me.  stride ==   stride);
2114
    assert(s->m.me.uvstride == uvstride);
2115

    
2116
    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2117
    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2118
    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2119
    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2120

    
2121
    c->xmin = - x*block_w - 16+2;
2122
    c->ymin = - y*block_w - 16+2;
2123
    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2124
    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2125

    
2126
    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2127
    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2128
    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2129
    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2130
    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2131
    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2132
    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2133

    
2134
    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2135
    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2136

    
2137
    if (!y) {
2138
        c->pred_x= P_LEFT[0];
2139
        c->pred_y= P_LEFT[1];
2140
    } else {
2141
        c->pred_x = P_MEDIAN[0];
2142
        c->pred_y = P_MEDIAN[1];
2143
    }
2144

    
2145
    score= ff_epzs_motion_search(&s->m, &mx, &my, P, 0, /*ref_index*/ 0, last_mv,
2146
                             (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2147

    
2148
    assert(mx >= c->xmin);
2149
    assert(mx <= c->xmax);
2150
    assert(my >= c->ymin);
2151
    assert(my <= c->ymax);
2152

    
2153
    score= s->m.me.sub_motion_search(&s->m, &mx, &my, score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2154
    score= ff_get_mb_score(&s->m, mx, my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2155
    //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
2156

    
2157
  //  subpel search
2158
    pc= s->c;
2159
    pc.bytestream_start=
2160
    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2161
    memcpy(p_state, s->block_state, sizeof(s->block_state));
2162

    
2163
    if(level!=s->block_max_depth)
2164
        put_rac(&pc, &p_state[4 + s_context], 1);
2165
    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2166
    put_symbol(&pc, &p_state[128 + 32*mx_context], mx - pmx, 1);
2167
    put_symbol(&pc, &p_state[128 + 32*my_context], my - pmy, 1);
2168
    p_len= pc.bytestream - pc.bytestream_start;
2169
    score += (s->lambda2*(p_len*8
2170
              + (pc.outstanding_count - s->c.outstanding_count)*8
2171
              + (-av_log2(pc.range)    + av_log2(s->c.range))
2172
             ))>>FF_LAMBDA_SHIFT;
2173

    
2174
    block_s= block_w*block_w;
2175
    sum = pix_sum(current_data[0], stride, block_w);
2176
    l= (sum + block_s/2)/block_s;
2177
    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2178

    
2179
    block_s= block_w*block_w>>2;
2180
    sum = pix_sum(current_data[1], uvstride, block_w>>1);
2181
    cb= (sum + block_s/2)/block_s;
2182
//    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2183
    sum = pix_sum(current_data[2], uvstride, block_w>>1);
2184
    cr= (sum + block_s/2)/block_s;
2185
//    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2186

    
2187
    ic= s->c;
2188
    ic.bytestream_start=
2189
    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2190
    memcpy(i_state, s->block_state, sizeof(s->block_state));
2191
    if(level!=s->block_max_depth)
2192
        put_rac(&ic, &i_state[4 + s_context], 1);
2193
    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2194
    put_symbol(&ic, &i_state[32],  l-pl , 1);
2195
    put_symbol(&ic, &i_state[64], cb-pcb, 1);
2196
    put_symbol(&ic, &i_state[96], cr-pcr, 1);
2197
    i_len= ic.bytestream - ic.bytestream_start;
2198
    iscore += (s->lambda2*(i_len*8
2199
              + (ic.outstanding_count - s->c.outstanding_count)*8
2200
              + (-av_log2(ic.range)    + av_log2(s->c.range))
2201
             ))>>FF_LAMBDA_SHIFT;
2202

    
2203
//    assert(score==256*256*256*64-1);
2204
    assert(iscore < 255*255*256 + s->lambda2*10);
2205
    assert(iscore >= 0);
2206
    assert(l>=0 && l<=255);
2207
    assert(pl>=0 && pl<=255);
2208

    
2209
    if(level==0){
2210
        int varc= iscore >> 8;
2211
        int vard= score >> 8;
2212
        if (vard <= 64 || vard < varc)
2213
            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2214
        else
2215
            c->scene_change_score+= s->m.qscale;
2216
    }
2217

    
2218
    if(level!=s->block_max_depth){
2219
        put_rac(&s->c, &s->block_state[4 + s_context], 0);
2220
        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2221
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2222
        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2223
        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2224
        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2225

    
2226
        if(score2 < score && score2 < iscore)
2227
            return score2;
2228
    }
2229

    
2230
    if(iscore < score){
2231
        memcpy(pbbak, i_buffer, i_len);
2232
        s->c= ic;
2233
        s->c.bytestream_start= pbbak_start;
2234
        s->c.bytestream= pbbak + i_len;
2235
        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, BLOCK_INTRA);
2236
        memcpy(s->block_state, i_state, sizeof(s->block_state));
2237
        return iscore;
2238
    }else{
2239
        memcpy(pbbak, p_buffer, p_len);
2240
        s->c= pc;
2241
        s->c.bytestream_start= pbbak_start;
2242
        s->c.bytestream= pbbak + p_len;
2243
        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, 0);
2244
        memcpy(s->block_state, p_state, sizeof(s->block_state));
2245
        return score;
2246
    }
2247
}
2248

    
2249
static always_inline int same_block(BlockNode *a, BlockNode *b){
2250
    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2251
        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2252
    }else{
2253
        return !((a->mx - b->mx) | (a->my - b->my) | ((a->type ^ b->type)&BLOCK_INTRA));
2254
    }
2255
}
2256

    
2257
static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2258
    const int w= s->b_width  << s->block_max_depth;
2259
    const int rem_depth= s->block_max_depth - level;
2260
    const int index= (x + y*w) << rem_depth;
2261
    int trx= (x+1)<<rem_depth;
2262
    BlockNode *b= &s->block[index];
2263
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2264
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
2265
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2266
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2267
    int pl = left->color[0];
2268
    int pcb= left->color[1];
2269
    int pcr= left->color[2];
2270
    int pmx= mid_pred(left->mx, top->mx, tr->mx);
2271
    int pmy= mid_pred(left->my, top->my, tr->my);
2272
    int mx_context= av_log2(2*ABS(left->mx - top->mx));
2273
    int my_context= av_log2(2*ABS(left->my - top->my));
2274
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2275

    
2276
    if(s->keyframe){
2277
        set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, BLOCK_INTRA);
2278
        return;
2279
    }
2280

    
2281
    if(level!=s->block_max_depth){
2282
        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2283
            put_rac(&s->c, &s->block_state[4 + s_context], 1);
2284
        }else{
2285
            put_rac(&s->c, &s->block_state[4 + s_context], 0);
2286
            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2287
            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2288
            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2289
            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2290
            return;
2291
        }
2292
    }
2293
    if(b->type & BLOCK_INTRA){
2294
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2295
        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2296
        put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2297
        put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2298
        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, BLOCK_INTRA);
2299
    }else{
2300
        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2301
        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2302
        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2303
        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, 0);
2304
    }
2305
}
2306

    
2307
static void decode_q_branch(SnowContext *s, int level, int x, int y){
2308
    const int w= s->b_width << s->block_max_depth;
2309
    const int rem_depth= s->block_max_depth - level;
2310
    const int index= (x + y*w) << rem_depth;
2311
    int trx= (x+1)<<rem_depth;
2312
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
2313
    BlockNode *top   = y ? &s->block[index-w] : &null_block;
2314
    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2315
    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2316
    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2317

    
2318
    if(s->keyframe){
2319
        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, BLOCK_INTRA);
2320
        return;
2321
    }
2322

    
2323
    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2324
        int type;
2325
        int l = left->color[0];
2326
        int cb= left->color[1];
2327
        int cr= left->color[2];
2328
        int mx= mid_pred(left->mx, top->mx, tr->mx);
2329
        int my= mid_pred(left->my, top->my, tr->my);
2330
        int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 0*av_log2(2*ABS(tr->mx - top->mx));
2331
        int my_context= av_log2(2*ABS(left->my - top->my)) + 0*av_log2(2*ABS(tr->my - top->my));
2332

    
2333
        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2334

    
2335
        if(type){
2336
            l += get_symbol(&s->c, &s->block_state[32], 1);
2337
            cb+= get_symbol(&s->c, &s->block_state[64], 1);
2338
            cr+= get_symbol(&s->c, &s->block_state[96], 1);
2339
        }else{
2340
            mx+= get_symbol(&s->c, &s->block_state[128 + 32*mx_context], 1);
2341
            my+= get_symbol(&s->c, &s->block_state[128 + 32*my_context], 1);
2342
        }
2343
        set_blocks(s, level, x, y, l, cb, cr, mx, my, type);
2344
    }else{
2345
        decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2346
        decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2347
        decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2348
        decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2349
    }
2350
}
2351

    
2352
static void encode_blocks(SnowContext *s){
2353
    int x, y;
2354
    int w= s->b_width;
2355
    int h= s->b_height;
2356

    
2357
    if(s->avctx->me_method == ME_ITER && !s->keyframe)
2358
        iterative_me(s);
2359

    
2360
    for(y=0; y<h; y++){
2361
        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2362
            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2363
            return;
2364
        }
2365
        for(x=0; x<w; x++){
2366
            if(s->avctx->me_method == ME_ITER)
2367
                encode_q_branch2(s, 0, x, y);
2368
            else
2369
                encode_q_branch (s, 0, x, y);
2370
        }
2371
    }
2372
}
2373

    
2374
static void decode_blocks(SnowContext *s){
2375
    int x, y;
2376
    int w= s->b_width;
2377
    int h= s->b_height;
2378

    
2379
    for(y=0; y<h; y++){
2380
        for(x=0; x<w; x++){
2381
            decode_q_branch(s, 0, x, y);
2382
        }
2383
    }
2384
}
2385

    
2386
static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2387
    int x, y;
2388
START_TIMER
2389
    for(y=0; y < b_h+5; y++){
2390
        for(x=0; x < b_w; x++){
2391
            int a0= src[x    ];
2392
            int a1= src[x + 1];
2393
            int a2= src[x + 2];
2394
            int a3= src[x + 3];
2395
            int a4= src[x + 4];
2396
            int a5= src[x + 5];
2397
//            int am= 9*(a1+a2) - (a0+a3);
2398
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2399
//            int am= 18*(a2+a3) - 2*(a1+a4);
2400
//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2401
//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2402

    
2403
//            if(b_w==16) am= 8*(a1+a2);
2404

    
2405
            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2406
            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2407

    
2408
            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2409
            if(am&(~255)) am= ~(am>>31);
2410

    
2411
            tmp[x] = am;
2412

    
2413
/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2414
            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2415
            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2416
            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2417
        }
2418
        tmp += stride;
2419
        src += stride;
2420
    }
2421
    tmp -= (b_h+5)*stride;
2422

    
2423
    for(y=0; y < b_h; y++){
2424
        for(x=0; x < b_w; x++){
2425
            int a0= tmp[x + 0*stride];
2426
            int a1= tmp[x + 1*stride];
2427
            int a2= tmp[x + 2*stride];
2428
            int a3= tmp[x + 3*stride];
2429
            int a4= tmp[x + 4*stride];
2430
            int a5= tmp[x + 5*stride];
2431
            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2432
//            int am= 18*(a2+a3) - 2*(a1+a4);
2433
/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2434
            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2435

    
2436
//            if(b_w==16) am= 8*(a1+a2);
2437

    
2438
            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2439
            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2440

    
2441
            if(am&(~255)) am= ~(am>>31);
2442

    
2443
            dst[x] = am;
2444
/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2445
            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2446
            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2447
            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2448
        }
2449
        dst += stride;
2450
        tmp += stride;
2451
    }
2452
STOP_TIMER("mc_block")
2453
}
2454

    
2455
#define mca(dx,dy,b_w)\
2456
static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
2457
    uint8_t tmp[stride*(b_w+5)];\
2458
    assert(h==b_w);\
2459
    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2460
}
2461

    
2462
mca( 0, 0,16)
2463
mca( 8, 0,16)
2464
mca( 0, 8,16)
2465
mca( 8, 8,16)
2466
mca( 0, 0,8)
2467
mca( 8, 0,8)
2468
mca( 0, 8,8)
2469
mca( 8, 8,8)
2470

    
2471
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2472
    if(block->type & BLOCK_INTRA){
2473
        int x, y;
2474
        const int color = block->color[plane_index];
2475
        const int color4= color*0x01010101;
2476
        if(b_w==32){
2477
            for(y=0; y < b_h; y++){
2478
                *(uint32_t*)&dst[0 + y*stride]= color4;
2479
                *(uint32_t*)&dst[4 + y*stride]= color4;
2480
                *(uint32_t*)&dst[8 + y*stride]= color4;
2481
                *(uint32_t*)&dst[12+ y*stride]= color4;
2482
                *(uint32_t*)&dst[16+ y*stride]= color4;
2483
                *(uint32_t*)&dst[20+ y*stride]= color4;
2484
                *(uint32_t*)&dst[24+ y*stride]= color4;
2485
                *(uint32_t*)&dst[28+ y*stride]= color4;
2486
            }
2487
        }else if(b_w==16){
2488
            for(y=0; y < b_h; y++){
2489
                *(uint32_t*)&dst[0 + y*stride]= color4;
2490
                *(uint32_t*)&dst[4 + y*stride]= color4;
2491
                *(uint32_t*)&dst[8 + y*stride]= color4;
2492
                *(uint32_t*)&dst[12+ y*stride]= color4;
2493
            }
2494
        }else if(b_w==8){
2495
            for(y=0; y < b_h; y++){
2496
                *(uint32_t*)&dst[0 + y*stride]= color4;
2497
                *(uint32_t*)&dst[4 + y*stride]= color4;
2498
            }
2499
        }else if(b_w==4){
2500
            for(y=0; y < b_h; y++){
2501
                *(uint32_t*)&dst[0 + y*stride]= color4;
2502
            }
2503
        }else{
2504
            for(y=0; y < b_h; y++){
2505
                for(x=0; x < b_w; x++){
2506
                    dst[x + y*stride]= color;
2507
                }
2508
            }
2509
        }
2510
    }else{
2511
        const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2512
        int mx= block->mx*scale;
2513
        int my= block->my*scale;
2514
        const int dx= mx&15;
2515
        const int dy= my&15;
2516
        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2517
        sx += (mx>>4) - 2;
2518
        sy += (my>>4) - 2;
2519
        src += sx + sy*stride;
2520
        if(   (unsigned)sx >= w - b_w - 4
2521
           || (unsigned)sy >= h - b_h - 4){
2522
            ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2523
            src= tmp + MB_SIZE;
2524
        }
2525
        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2526
        assert(!(b_w&(b_w-1)));
2527
        assert(b_w>1 && b_h>1);
2528
        assert(tab_index>=0 && tab_index<4 || b_w==32);
2529
        if((dx&3) || (dy&3))
2530
            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2531
        else if(b_w==32){
2532
            int y;
2533
            for(y=0; y<b_h; y+=16){
2534
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2535
                s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2536
            }
2537
        }else if(b_w==b_h)
2538
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2539
        else if(b_w==2*b_h){
2540
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2541
            s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2542
        }else{
2543
            assert(2*b_w==b_h);
2544
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2545
            s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2546
        }
2547
    }
2548
}
2549

    
2550
//FIXME name clenup (b_w, block_w, b_width stuff)
2551
static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
2552
    DWTELEM * dst = NULL;
2553
    const int b_width = s->b_width  << s->block_max_depth;
2554
    const int b_height= s->b_height << s->block_max_depth;
2555
    const int b_stride= b_width;
2556
    BlockNode *lt= &s->block[b_x + b_y*b_stride];
2557
    BlockNode *rt= lt+1;
2558
    BlockNode *lb= lt+b_stride;
2559
    BlockNode *rb= lb+1;
2560
    uint8_t *block[4];
2561
    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2562
    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2563
    uint8_t *ptmp;
2564
    int x,y;
2565

    
2566
    if(b_x<0){
2567
        lt= rt;
2568
        lb= rb;
2569
    }else if(b_x + 1 >= b_width){
2570
        rt= lt;
2571
        rb= lb;
2572
    }
2573
    if(b_y<0){
2574
        lt= lb;
2575
        rt= rb;
2576
    }else if(b_y + 1 >= b_height){
2577
        lb= lt;
2578
        rb= rt;
2579
    }
2580

    
2581
    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2582
        obmc -= src_x;
2583
        b_w += src_x;
2584
        src_x=0;
2585
    }else if(src_x + b_w > w){
2586
        b_w = w - src_x;
2587
    }
2588
    if(src_y<0){
2589
        obmc -= src_y*obmc_stride;
2590
        b_h += src_y;
2591
        src_y=0;
2592
    }else if(src_y + b_h> h){
2593
        b_h = h - src_y;
2594
    }
2595

    
2596
    if(b_w<=0 || b_h<=0) return;
2597

    
2598
assert(src_stride > 2*MB_SIZE + 5);
2599
//    old_dst += src_x + src_y*dst_stride;
2600
    dst8+= src_x + src_y*src_stride;
2601
//    src += src_x + src_y*src_stride;
2602

    
2603
    ptmp= tmp + 3*tmp_step;
2604
    block[0]= ptmp;
2605
    ptmp+=tmp_step;
2606
    pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2607

    
2608
    if(same_block(lt, rt)){
2609
        block[1]= block[0];
2610
    }else{
2611
        block[1]= ptmp;
2612
        ptmp+=tmp_step;
2613
        pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2614
    }
2615

    
2616
    if(same_block(lt, lb)){
2617
        block[2]= block[0];
2618
    }else if(same_block(rt, lb)){
2619
        block[2]= block[1];
2620
    }else{
2621
        block[2]= ptmp;
2622
        ptmp+=tmp_step;
2623
        pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2624
    }
2625

    
2626
    if(same_block(lt, rb) ){
2627
        block[3]= block[0];
2628
    }else if(same_block(rt, rb)){
2629
        block[3]= block[1];
2630
    }else if(same_block(lb, rb)){
2631
        block[3]= block[2];
2632
    }else{
2633
        block[3]= ptmp;
2634
        pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2635
    }
2636
#if 0
2637
    for(y=0; y<b_h; y++){
2638
        for(x=0; x<b_w; x++){
2639
            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2640
            if(add) dst[x + y*dst_stride] += v;
2641
            else    dst[x + y*dst_stride] -= v;
2642
        }
2643
    }
2644
    for(y=0; y<b_h; y++){
2645
        uint8_t *obmc2= obmc + (obmc_stride>>1);
2646
        for(x=0; x<b_w; x++){
2647
            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2648
            if(add) dst[x + y*dst_stride] += v;
2649
            else    dst[x + y*dst_stride] -= v;
2650
        }
2651
    }
2652
    for(y=0; y<b_h; y++){
2653
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2654
        for(x=0; x<b_w; x++){
2655
            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2656
            if(add) dst[x + y*dst_stride] += v;
2657
            else    dst[x + y*dst_stride] -= v;
2658
        }
2659
    }
2660
    for(y=0; y<b_h; y++){
2661
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2662
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2663
        for(x=0; x<b_w; x++){
2664
            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2665
            if(add) dst[x + y*dst_stride] += v;
2666
            else    dst[x + y*dst_stride] -= v;
2667
        }
2668
    }
2669
#else
2670
{
2671

    
2672
    START_TIMER
2673

    
2674
    for(y=0; y<b_h; y++){
2675
        //FIXME ugly missue of obmc_stride
2676
        uint8_t *obmc1= obmc + y*obmc_stride;
2677
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2678
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2679
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2680
        dst = slice_buffer_get_line(sb, src_y + y);
2681
        for(x=0; x<b_w; x++){
2682
            int v=   obmc1[x] * block[3][x + y*src_stride]
2683
                    +obmc2[x] * block[2][x + y*src_stride]
2684
                    +obmc3[x] * block[1][x + y*src_stride]
2685
                    +obmc4[x] * block[0][x + y*src_stride];
2686

    
2687
            v <<= 8 - LOG2_OBMC_MAX;
2688
            if(FRAC_BITS != 8){
2689
                v += 1<<(7 - FRAC_BITS);
2690
                v >>= 8 - FRAC_BITS;
2691
            }
2692
            if(add){
2693
//                v += old_dst[x + y*dst_stride];
2694
                v += dst[x + src_x];
2695
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2696
                if(v&(~255)) v= ~(v>>31);
2697
                dst8[x + y*src_stride] = v;
2698
            }else{
2699
//                old_dst[x + y*dst_stride] -= v;
2700
                dst[x + src_x] -= v;
2701
            }
2702
        }
2703
    }
2704
        STOP_TIMER("Inner add y block")
2705
}
2706
#endif
2707
}
2708

    
2709
//FIXME name clenup (b_w, block_w, b_width stuff)
2710
static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2711
    const int b_width = s->b_width  << s->block_max_depth;
2712
    const int b_height= s->b_height << s->block_max_depth;
2713
    const int b_stride= b_width;
2714
    BlockNode *lt= &s->block[b_x + b_y*b_stride];
2715
    BlockNode *rt= lt+1;
2716
    BlockNode *lb= lt+b_stride;
2717
    BlockNode *rb= lb+1;
2718
    uint8_t *block[4];
2719
    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2720
    uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2721
    uint8_t *ptmp;
2722
    int x,y;
2723

    
2724
    if(b_x<0){
2725
        lt= rt;
2726
        lb= rb;
2727
    }else if(b_x + 1 >= b_width){
2728
        rt= lt;
2729
        rb= lb;
2730
    }
2731
    if(b_y<0){
2732
        lt= lb;
2733
        rt= rb;
2734
    }else if(b_y + 1 >= b_height){
2735
        lb= lt;
2736
        rb= rt;
2737
    }
2738

    
2739
    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2740
        obmc -= src_x;
2741
        b_w += src_x;
2742
        if(!offset_dst)
2743
            dst -= src_x;
2744
        src_x=0;
2745
    }else if(src_x + b_w > w){
2746
        b_w = w - src_x;
2747
    }
2748
    if(src_y<0){
2749
        obmc -= src_y*obmc_stride;
2750
        b_h += src_y;
2751
        if(!offset_dst)
2752
            dst -= src_y*dst_stride;
2753
        src_y=0;
2754
    }else if(src_y + b_h> h){
2755
        b_h = h - src_y;
2756
    }
2757

    
2758
    if(b_w<=0 || b_h<=0) return;
2759

    
2760
assert(src_stride > 2*MB_SIZE + 5);
2761
    if(offset_dst)
2762
        dst += src_x + src_y*dst_stride;
2763
    dst8+= src_x + src_y*src_stride;
2764
//    src += src_x + src_y*src_stride;
2765

    
2766
    ptmp= tmp + 3*tmp_step;
2767
    block[0]= ptmp;
2768
    ptmp+=tmp_step;
2769
    pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2770

    
2771
    if(same_block(lt, rt)){
2772
        block[1]= block[0];
2773
    }else{
2774
        block[1]= ptmp;
2775
        ptmp+=tmp_step;
2776
        pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2777
    }
2778

    
2779
    if(same_block(lt, lb)){
2780
        block[2]= block[0];
2781
    }else if(same_block(rt, lb)){
2782
        block[2]= block[1];
2783
    }else{
2784
        block[2]= ptmp;
2785
        ptmp+=tmp_step;
2786
        pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2787
    }
2788

    
2789
    if(same_block(lt, rb) ){
2790
        block[3]= block[0];
2791
    }else if(same_block(rt, rb)){
2792
        block[3]= block[1];
2793
    }else if(same_block(lb, rb)){
2794
        block[3]= block[2];
2795
    }else{
2796
        block[3]= ptmp;
2797
        pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2798
    }
2799
#if 0
2800
    for(y=0; y<b_h; y++){
2801
        for(x=0; x<b_w; x++){
2802
            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2803
            if(add) dst[x + y*dst_stride] += v;
2804
            else    dst[x + y*dst_stride] -= v;
2805
        }
2806
    }
2807
    for(y=0; y<b_h; y++){
2808
        uint8_t *obmc2= obmc + (obmc_stride>>1);
2809
        for(x=0; x<b_w; x++){
2810
            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2811
            if(add) dst[x + y*dst_stride] += v;
2812
            else    dst[x + y*dst_stride] -= v;
2813
        }
2814
    }
2815
    for(y=0; y<b_h; y++){
2816
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2817
        for(x=0; x<b_w; x++){
2818
            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2819
            if(add) dst[x + y*dst_stride] += v;
2820
            else    dst[x + y*dst_stride] -= v;
2821
        }
2822
    }
2823
    for(y=0; y<b_h; y++){
2824
        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2825
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2826
        for(x=0; x<b_w; x++){
2827
            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2828
            if(add) dst[x + y*dst_stride] += v;
2829
            else    dst[x + y*dst_stride] -= v;
2830
        }
2831
    }
2832
#else
2833
    for(y=0; y<b_h; y++){
2834
        //FIXME ugly missue of obmc_stride
2835
        uint8_t *obmc1= obmc + y*obmc_stride;
2836
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2837
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2838
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2839
        for(x=0; x<b_w; x++){
2840
            int v=   obmc1[x] * block[3][x + y*src_stride]
2841
                    +obmc2[x] * block[2][x + y*src_stride]
2842
                    +obmc3[x] * block[1][x + y*src_stride]
2843
                    +obmc4[x] * block[0][x + y*src_stride];
2844

    
2845
            v <<= 8 - LOG2_OBMC_MAX;
2846
            if(FRAC_BITS != 8){
2847
                v += 1<<(7 - FRAC_BITS);
2848
                v >>= 8 - FRAC_BITS;
2849
            }
2850
            if(add){
2851
                v += dst[x + y*dst_stride];
2852
                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2853
                if(v&(~255)) v= ~(v>>31);
2854
                dst8[x + y*src_stride] = v;
2855
            }else{
2856
                dst[x + y*dst_stride] -= v;
2857
            }
2858
        }
2859
    }
2860
#endif
2861
}
2862

    
2863
static always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2864
    Plane *p= &s->plane[plane_index];
2865
    const int mb_w= s->b_width  << s->block_max_depth;
2866
    const int mb_h= s->b_height << s->block_max_depth;
2867
    int x, y, mb_x;
2868
    int block_size = MB_SIZE >> s->block_max_depth;
2869
    int block_w    = plane_index ? block_size/2 : block_size;
2870
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2871
    int obmc_stride= plane_index ? block_size : 2*block_size;
2872
    int ref_stride= s->current_picture.linesize[plane_index];
2873
    uint8_t *ref  = s->last_picture.data[plane_index];
2874
    uint8_t *dst8= s->current_picture.data[plane_index];
2875
    int w= p->width;
2876
    int h= p->height;
2877
    START_TIMER
2878

    
2879
    if(s->keyframe || (s->avctx->debug&512)){
2880
        if(mb_y==mb_h)
2881
            return;
2882

    
2883
        if(add){
2884
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2885
            {
2886
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2887
                DWTELEM * line = sb->line[y];
2888
                for(x=0; x<w; x++)
2889
                {
2890
//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2891
                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2892
                    v >>= FRAC_BITS;
2893
                    if(v&(~255)) v= ~(v>>31);
2894
                    dst8[x + y*ref_stride]= v;
2895
                }
2896
            }
2897
        }else{
2898
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2899
            {
2900
//                DWTELEM * line = slice_buffer_get_line(sb, y);
2901
                DWTELEM * line = sb->line[y];
2902
                for(x=0; x<w; x++)
2903
                {
2904
                    line[x] -= 128 << FRAC_BITS;
2905
//                    buf[x + y*w]-= 128<<FRAC_BITS;
2906
                }
2907
            }
2908
        }
2909

    
2910
        return;
2911
    }
2912

    
2913
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2914
            START_TIMER
2915

    
2916
            add_yblock_buffered(s, sb, old_buffer, dst8, ref, obmc,
2917
                       block_w*mb_x - block_w/2,
2918
                       block_w*mb_y - block_w/2,
2919
                       block_w, block_w,
2920
                       w, h,
2921
                       w, ref_stride, obmc_stride,
2922
                       mb_x - 1, mb_y - 1,
2923
                       add, plane_index);
2924

    
2925
            STOP_TIMER("add_yblock")
2926
        }
2927

    
2928
    STOP_TIMER("predict_slice")
2929
}
2930

    
2931
static always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2932
    Plane *p= &s->plane[plane_index];
2933
    const int mb_w= s->b_width  << s->block_max_depth;
2934
    const int mb_h= s->b_height << s->block_max_depth;
2935
    int x, y, mb_x;
2936
    int block_size = MB_SIZE >> s->block_max_depth;
2937
    int block_w    = plane_index ? block_size/2 : block_size;
2938
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2939
    const int obmc_stride= plane_index ? block_size : 2*block_size;
2940
    int ref_stride= s->current_picture.linesize[plane_index];
2941
    uint8_t *ref  = s->last_picture.data[plane_index];
2942
    uint8_t *dst8= s->current_picture.data[plane_index];
2943
    int w= p->width;
2944
    int h= p->height;
2945
    START_TIMER
2946

    
2947
    if(s->keyframe || (s->avctx->debug&512)){
2948
        if(mb_y==mb_h)
2949
            return;
2950

    
2951
        if(add){
2952
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2953
                for(x=0; x<w; x++){
2954
                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2955
                    v >>= FRAC_BITS;
2956
                    if(v&(~255)) v= ~(v>>31);
2957
                    dst8[x + y*ref_stride]= v;
2958
                }
2959
            }
2960
        }else{
2961
            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2962
                for(x=0; x<w; x++){
2963
                    buf[x + y*w]-= 128<<FRAC_BITS;
2964
                }
2965
            }
2966
        }
2967

    
2968
        return;
2969
    }
2970

    
2971
        for(mb_x=0; mb_x<=mb_w; mb_x++){
2972
            START_TIMER
2973

    
2974
            add_yblock(s, buf, dst8, ref, obmc,
2975
                       block_w*mb_x - block_w/2,
2976
                       block_w*mb_y - block_w/2,
2977
                       block_w, block_w,
2978
                       w, h,
2979
                       w, ref_stride, obmc_stride,
2980
                       mb_x - 1, mb_y - 1,
2981
                       add, 1, plane_index);
2982

    
2983
            STOP_TIMER("add_yblock")
2984
        }
2985

    
2986
    STOP_TIMER("predict_slice")
2987
}
2988

    
2989
static always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2990
    const int mb_h= s->b_height << s->block_max_depth;
2991
    int mb_y;
2992
    for(mb_y=0; mb_y<=mb_h; mb_y++)
2993
        predict_slice(s, buf, plane_index, add, mb_y);
2994
}
2995

    
2996
static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2997
    int i, x2, y2;
2998
    Plane *p= &s->plane[plane_index];
2999
    const int block_size = MB_SIZE >> s->block_max_depth;
3000
    const int block_w    = plane_index ? block_size/2 : block_size;
3001
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3002
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3003
    const int ref_stride= s->current_picture.linesize[plane_index];
3004
    uint8_t *ref= s->   last_picture.data[plane_index];
3005
    uint8_t *src= s-> input_picture.data[plane_index];
3006
    DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
3007
    const int b_stride = s->b_width << s->block_max_depth;
3008
    const int w= p->width;
3009
    const int h= p->height;
3010
    int index= mb_x + mb_y*b_stride;
3011
    BlockNode *b= &s->block[index];
3012
    BlockNode backup= *b;
3013
    int ab=0;
3014
    int aa=0;
3015

    
3016
    b->type|= BLOCK_INTRA;
3017
    b->color[plane_index]= 0;
3018
    memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
3019

    
3020
    for(i=0; i<4; i++){
3021
        int mb_x2= mb_x + (i &1) - 1;
3022
        int mb_y2= mb_y + (i>>1) - 1;
3023
        int x= block_w*mb_x2 + block_w/2;
3024
        int y= block_w*mb_y2 + block_w/2;
3025

    
3026
        add_yblock(s, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, ref, obmc, 
3027
                    x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
3028

    
3029
        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
3030
            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
3031
                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
3032
                int obmc_v= obmc[index];
3033
                int d;
3034
                if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
3035
                if(x<0) obmc_v += obmc[index + block_w];
3036
                if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
3037
                if(x+block_w>w) obmc_v += obmc[index - block_w];
3038
                //FIXME precalc this or simplify it somehow else
3039

    
3040
                d = -dst[index] + (1<<(FRAC_BITS-1));
3041
                dst[index] = d;
3042
                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
3043
                aa += obmc_v * obmc_v; //FIXME precalclate this
3044
            }
3045
        }
3046
    }
3047
    *b= backup;
3048

    
3049
    return clip(((ab<<6) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
3050
}
3051

    
3052
static inline int get_block_bits(SnowContext *s, int x, int y, int w){
3053
    const int b_stride = s->b_width << s->block_max_depth;
3054
    const int b_height = s->b_height<< s->block_max_depth;
3055
    int index= x + y*b_stride;
3056
    BlockNode *b     = &s->block[index];
3057
    BlockNode *left  = x ? &s->block[index-1] : &null_block;
3058
    BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
3059
    BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
3060
    BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
3061
    int dmx, dmy;
3062
//  int mx_context= av_log2(2*ABS(left->mx - top->mx));
3063
//  int my_context= av_log2(2*ABS(left->my - top->my));
3064

    
3065
    if(x<0 || x>=b_stride || y>=b_height)
3066
        return 0;
3067
    dmx= b->mx - mid_pred(left->mx, top->mx, tr->mx);
3068
    dmy= b->my - mid_pred(left->my, top->my, tr->my);
3069
/*
3070
1            0      0
3071
01X          1-2    1
3072
001XX        3-6    2-3
3073
0001XXX      7-14   4-7
3074
00001XXXX   15-30   8-15
3075
*/
3076
//FIXME try accurate rate
3077
//FIXME intra and inter predictors if surrounding blocks arent the same type
3078
    if(b->type & BLOCK_INTRA){
3079
        return 3+2*( av_log2(2*ABS(left->color[0] - b->color[0]))
3080
                   + av_log2(2*ABS(left->color[1] - b->color[1]))
3081
                   + av_log2(2*ABS(left->color[2] - b->color[2])));
3082
    }else
3083
        return 2*(1 + av_log2(2*ABS(dmx))
3084
                    + av_log2(2*ABS(dmy))); //FIXME kill the 2* can be merged in lambda
3085
}
3086

    
3087
static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
3088
    Plane *p= &s->plane[plane_index];
3089
    const int block_size = MB_SIZE >> s->block_max_depth;
3090
    const int block_w    = plane_index ? block_size/2 : block_size;
3091
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3092
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3093
    const int ref_stride= s->current_picture.linesize[plane_index];
3094
    uint8_t *ref= s->   last_picture.data[plane_index];
3095
    uint8_t *dst= s->current_picture.data[plane_index];
3096
    uint8_t *src= s->  input_picture.data[plane_index];
3097
    DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
3098
    uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
3099
    uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
3100
    const int b_stride = s->b_width << s->block_max_depth;
3101
    const int b_height = s->b_height<< s->block_max_depth;
3102
    const int w= p->width;
3103
    const int h= p->height;
3104
    int distortion;
3105
    int rate= 0;
3106
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3107
    int sx= block_w*mb_x - block_w/2;
3108
    int sy= block_w*mb_y - block_w/2;
3109
    const int x0= FFMAX(0,-sx);
3110
    const int y0= FFMAX(0,-sy);
3111
    const int x1= FFMIN(block_w*2, w-sx);
3112
    const int y1= FFMIN(block_w*2, h-sy);
3113
    int i,x,y;
3114

    
3115
    pred_block(s, cur, ref, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
3116

    
3117
    for(y=y0; y<y1; y++){
3118
        const uint8_t *obmc1= obmc_edged + y*obmc_stride;
3119
        const DWTELEM *pred1 = pred + y*obmc_stride;
3120
        uint8_t *cur1 = cur + y*ref_stride;
3121
        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
3122
        for(x=x0; x<x1; x++){
3123
            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
3124
            v = (v + pred1[x]) >> FRAC_BITS;
3125
            if(v&(~255)) v= ~(v>>31);
3126
            dst1[x] = v;
3127
        }
3128
    }
3129

    
3130
    //FIXME sad/ssd can be broken up, but wavelet cmp should be one 32x32 block
3131
    if(block_w==16){
3132
        distortion = 0;
3133
        for(i=0; i<4; i++){
3134
            int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3135
            distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3136
        }
3137
    }else{
3138
        assert(block_w==8);
3139
        distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3140
    }
3141

    
3142
    if(plane_index==0){
3143
        for(i=0; i<4; i++){
3144
/* ..RRr
3145
 * .RXx.
3146
 * rxx..
3147
 */
3148
            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3149
        }
3150
    }
3151
    return distortion + rate*penalty_factor;
3152
}
3153

    
3154
static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3155
    int i, y2;
3156
    Plane *p= &s->plane[plane_index];
3157
    const int block_size = MB_SIZE >> s->block_max_depth;
3158
    const int block_w    = plane_index ? block_size/2 : block_size;
3159
    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3160
    const int obmc_stride= plane_index ? block_size : 2*block_size;
3161
    const int ref_stride= s->current_picture.linesize[plane_index];
3162
    uint8_t *ref= s->   last_picture.data[plane_index];
3163
    uint8_t *dst= s->current_picture.data[plane_index];
3164
    uint8_t *src= s-> input_picture.data[plane_index];
3165
    const static DWTELEM zero_dst[4096]; //FIXME
3166
    const int b_stride = s->b_width << s->block_max_depth;
3167
    const int b_height = s->b_height<< s->block_max_depth;
3168
    const int w= p->width;
3169
    const int h= p->height;
3170
    int distortion= 0;
3171
    int rate= 0;
3172
    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3173

    
3174
    for(i=0; i<9; i++){
3175
        int mb_x2= mb_x + (i%3) - 1;
3176
        int mb_y2= mb_y + (i/3) - 1;
3177
        int x= block_w*mb_x2 + block_w/2;
3178
        int y= block_w*mb_y2 + block_w/2;
3179

    
3180
        add_yblock(s, zero_dst, dst, ref, obmc, 
3181
                   x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3182

    
3183
        //FIXME find a cleaner/simpler way to skip the outside stuff
3184
        for(y2= y; y2<0; y2++)
3185
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3186
        for(y2= h; y2<y+block_w; y2++)
3187
            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3188
        if(x<0){
3189
            for(y2= y; y2<y+block_w; y2++)
3190
                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3191
        }
3192
        if(x+block_w > w){
3193
            for(y2= y; y2<y+block_w; y2++)
3194
                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3195
        }
3196

    
3197
        assert(block_w== 8 || block_w==16);
3198
        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3199
    }
3200

    
3201
    if(plane_index==0){
3202
        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3203
        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3204

    
3205
/* ..RRRr
3206
 * .RXXx.
3207
 * .RXXx.
3208
 * rxxx.
3209
 */
3210
        if(merged)
3211
            rate = get_block_bits(s, mb_x, mb_y, 2);
3212
        for(i=merged?4:0; i<9; i++){
3213
            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3214
            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3215
        }
3216
    }
3217
    return distortion + rate*penalty_factor;
3218
}
3219

    
3220
static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3221
    const int b_stride= s->b_width << s->block_max_depth;
3222
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3223
    BlockNode backup= *block;
3224
    int rd, index, value;
3225

    
3226
    assert(mb_x>=0 && mb_y>=0);
3227
    assert(mb_x<b_stride);
3228

    
3229
    if(intra){
3230
        block->color[0] = p[0];
3231
        block->color[1] = p[1];
3232
        block->color[2] = p[2];
3233
        block->type |= BLOCK_INTRA;
3234
    }else{
3235
        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3236
        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6);
3237
        if(s->me_cache[index] == value)
3238
            return 0;
3239
        s->me_cache[index]= value;
3240

    
3241
        block->mx= p[0];
3242
        block->my= p[1];
3243
        block->type &= ~BLOCK_INTRA;
3244
    }
3245

    
3246
    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3247

    
3248
//FIXME chroma
3249
    if(rd < *best_rd){
3250
        *best_rd= rd;
3251
        return 1;
3252
    }else{
3253
        *block= backup;
3254
        return 0;
3255
    }
3256
}
3257

    
3258
/* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3259
static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int intra, const uint8_t *obmc_edged, int *best_rd){
3260
    int p[2] = {p0, p1};
3261
    return check_block(s, mb_x, mb_y, p, intra, obmc_edged, best_rd);
3262
}
3263

    
3264
static always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int *best_rd){
3265
    const int b_stride= s->b_width << s->block_max_depth;
3266
    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3267
    BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3268
    int rd, index, value;
3269

    
3270
    assert(mb_x>=0 && mb_y>=0);
3271
    assert(mb_x<b_stride);
3272
    assert(((mb_x|mb_y)&1) == 0);
3273

    
3274
    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3275
    value= s->me_cache_generation + (p0>>10) + (p1<<6);
3276
    if(s->me_cache[index] == value)
3277
        return 0;
3278
    s->me_cache[index]= value;
3279

    
3280
    block->mx= p0;
3281
    block->my= p1;
3282
    block->type &= ~BLOCK_INTRA;
3283
    block[1]= block[b_stride]= block[b_stride+1]= *block;
3284

    
3285
    rd= get_4block_rd(s, mb_x, mb_y, 0);
3286

    
3287
//FIXME chroma
3288
    if(rd < *best_rd){
3289
        *best_rd= rd;
3290
        return 1;
3291
    }else{
3292
        block[0]= backup[0];
3293
        block[1]= backup[1];
3294
        block[b_stride]= backup[2];
3295
        block[b_stride+1]= backup[3];
3296
        return 0;
3297
    }
3298
}
3299

    
3300
static void iterative_me(SnowContext *s){
3301
    int pass, mb_x, mb_y;
3302
    const int b_width = s->b_width  << s->block_max_depth;
3303
    const int b_height= s->b_height << s->block_max_depth;
3304
    const int b_stride= b_width;
3305
    int color[3];
3306
    const int first_crc_pass= 12;
3307
    uint32_t crcs[50];
3308

    
3309
    for(pass=0; pass<50; pass++){
3310
        int change= 0;
3311

    
3312
        for(mb_y= 0; mb_y<b_height; mb_y++){
3313
            for(mb_x= 0; mb_x<b_width; mb_x++){
3314
                int dia_change, i, j;
3315
                int best_rd= INT_MAX;
3316
                BlockNode backup;
3317
                const int index= mb_x + mb_y * b_stride;
3318
                BlockNode *block= &s->block[index];
3319
                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : &null_block;
3320
                BlockNode *lb = mb_x                              ? &s->block[index         -1] : &null_block;
3321
                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : &null_block;
3322
                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : &null_block;
3323
                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : &null_block;
3324
                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : &null_block;
3325
                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : &null_block;
3326
                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : &null_block;
3327
                const int b_w= (MB_SIZE >> s->block_max_depth);
3328
                uint8_t obmc_edged[b_w*2][b_w*2];
3329

    
3330
                if(pass && (block->type & BLOCK_OPT))
3331
                    continue;
3332
                block->type |= BLOCK_OPT;
3333

    
3334
                backup= *block;
3335

    
3336
                if(!s->me_cache_generation)
3337
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3338
                s->me_cache_generation += 1<<22;
3339

    
3340
                //FIXME precalc
3341
                {
3342
                    int x, y;
3343
                    memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3344
                    if(mb_x==0)
3345
                        for(y=0; y<b_w*2; y++)
3346
                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3347
                    if(mb_x==b_stride-1)
3348
                        for(y=0; y<b_w*2; y++)
3349
                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3350
                    if(mb_y==0){
3351
                        for(x=0; x<b_w*2; x++)
3352
                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
3353
                        for(y=1; y<b_w; y++)
3354
                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3355
                    }
3356
                    if(mb_y==b_height-1){
3357
                        for(x=0; x<b_w*2; x++)
3358
                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3359
                        for(y=b_w; y<b_w*2-1; y++)
3360
                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3361
                    }
3362
                }
3363

    
3364
                //skip stuff outside the picture
3365
                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3366
                {
3367
                    uint8_t *src= s->  input_picture.data[0];
3368
                    uint8_t *dst= s->current_picture.data[0];
3369
                    const int stride= s->current_picture.linesize[0];
3370
                    const int block_w= MB_SIZE >> s->block_max_depth;
3371
                    const int sx= block_w*mb_x - block_w/2;
3372
                    const int sy= block_w*mb_y - block_w/2;
3373
                    const int w= s->plane[0].width;
3374
                    const int h= s->plane[0].height;
3375
                    int y;
3376

    
3377
                    for(y=sy; y<0; y++)
3378
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3379
                    for(y=h; y<sy+block_w*2; y++)
3380
                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3381
                    if(sx<0){
3382
                        for(y=sy; y<sy+block_w*2; y++)
3383
                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3384
                    }
3385
                    if(sx+block_w*2 > w){
3386
                        for(y=sy; y<sy+block_w*2; y++)
3387
                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3388
                    }
3389
                }
3390

    
3391
                // intra(black) = neighbors' contribution to the current block
3392
                for(i=0; i<3; i++)
3393
                    color[i]= get_dc(s, mb_x, mb_y, i);
3394

    
3395
                // get previous score (cant be cached due to OBMC)
3396
                check_block_inter(s, mb_x, mb_y, block->mx, block->my, 0, *obmc_edged, &best_rd);
3397
                check_block_inter(s, mb_x, mb_y, 0, 0, 0, *obmc_edged, &best_rd);
3398
                check_block_inter(s, mb_x, mb_y, tb->mx, tb->my, 0, *obmc_edged, &best_rd);
3399
                check_block_inter(s, mb_x, mb_y, lb->mx, lb->my, 0, *obmc_edged, &best_rd);
3400
                check_block_inter(s, mb_x, mb_y, rb->mx, rb->my, 0, *obmc_edged, &best_rd);
3401
                check_block_inter(s, mb_x, mb_y, bb->mx, bb->my, 0, *obmc_edged, &best_rd);
3402

    
3403
                /* fullpel ME */
3404
                //FIXME avoid subpel interpol / round to nearest integer
3405
                do{
3406
                    dia_change=0;
3407
                    for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3408
                        for(j=0; j<i; j++){
3409
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), 0, *obmc_edged, &best_rd);
3410
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), 0, *obmc_edged, &best_rd);
3411
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), 0, *obmc_edged, &best_rd);
3412
                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), 0, *obmc_edged, &best_rd);
3413
                        }
3414
                    }
3415
                }while(dia_change);
3416
                /* subpel ME */
3417
                do{
3418
                    static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3419
                    dia_change=0;
3420
                    for(i=0; i<8; i++)
3421
                        dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], 0, *obmc_edged, &best_rd);
3422
                }while(dia_change);
3423
                //FIXME or try the standard 2 pass qpel or similar
3424
#if 1
3425
                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3426
                //FIXME RD style color selection
3427
#endif
3428
                if(!same_block(block, &backup)){
3429
                    if(tb != &null_block) tb ->type &= ~BLOCK_OPT;
3430
                    if(lb != &null_block) lb ->type &= ~BLOCK_OPT;
3431
                    if(rb != &null_block) rb ->type &= ~BLOCK_OPT;
3432
                    if(bb != &null_block) bb ->type &= ~BLOCK_OPT;
3433
                    if(tlb!= &null_block) tlb->type &= ~BLOCK_OPT;
3434
                    if(trb!= &null_block) trb->type &= ~BLOCK_OPT;
3435
                    if(blb!= &null_block) blb->type &= ~BLOCK_OPT;
3436
                    if(brb!= &null_block) brb->type &= ~BLOCK_OPT;
3437
                    change ++;
3438
                }
3439
            }
3440
        }
3441
        av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3442

    
3443
        if(pass >= first_crc_pass){
3444
            int i;
3445
            //FIXME can we hash just the blocks that were analysed?
3446
            crcs[pass]= crc32(crc32(0,NULL,0), (void*)s->block, b_stride*b_height*sizeof(BlockNode));
3447
            for(i=pass-1; i>=first_crc_pass; i--){
3448
                if(crcs[i] == crcs[pass]){
3449
                    change= 0;
3450
                    break;
3451
                }
3452
            }
3453
        }
3454
        if(!change)
3455
            break;
3456
    }
3457

    
3458
    if(s->block_max_depth == 1){
3459
        int change= 0;
3460
        for(mb_y= 0; mb_y<b_height; mb_y+=2){
3461
            for(mb_x= 0; mb_x<b_width; mb_x+=2){
3462
                int dia_change, i, j;
3463
                int best_rd, init_rd;
3464
                const int index= mb_x + mb_y * b_stride;
3465
                BlockNode *b[4];
3466

    
3467
                b[0]= &s->block[index];
3468
                b[1]= b[0]+1;
3469
                b[2]= b[0]+b_stride;
3470
                b[3]= b[2]+1;
3471
                if(same_block(b[0], b[1]) &&
3472
                   same_block(b[0], b[2]) &&
3473
                   same_block(b[0], b[3]))
3474
                    continue;
3475

    
3476
                if(!s->me_cache_generation)
3477
                    memset(s->me_cache, 0, sizeof(s->me_cache));
3478
                s->me_cache_generation += 1<<22;
3479

    
3480
                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3481

    
3482
                check_4block_inter(s, mb_x, mb_y,
3483
                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3484
                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, &best_rd);
3485

    
3486
                for(i=0; i<4; i++)
3487
                    if(!(b[i]->type&BLOCK_INTRA))
3488
                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, &best_rd);
3489

    
3490
                if(init_rd != best_rd)
3491
                    change++;
3492
            }
3493
        }
3494
        av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3495
    }
3496
}
3497

    
3498
static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3499
    const int level= b->level;
3500
    const int w= b->width;
3501
    const int h= b->height;
3502
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3503
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3504
    int x,y, thres1, thres2;
3505
//    START_TIMER
3506

    
3507
    if(s->qlog == LOSSLESS_QLOG) return;
3508

    
3509
    bias= bias ? 0 : (3*qmul)>>3;
3510
    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3511
    thres2= 2*thres1;
3512

    
3513
    if(!bias){
3514
        for(y=0; y<h; y++){
3515
            for(x=0; x<w; x++){
3516
                int i= src[x + y*stride];
3517

    
3518
                if((unsigned)(i+thres1) > thres2){
3519
                    if(i>=0){
3520
                        i<<= QEXPSHIFT;
3521
                        i/= qmul; //FIXME optimize
3522
                        src[x + y*stride]=  i;
3523
                    }else{
3524
                        i= -i;
3525
                        i<<= QEXPSHIFT;
3526
                        i/= qmul; //FIXME optimize
3527
                        src[x + y*stride]= -i;
3528
                    }
3529
                }else
3530
                    src[x + y*stride]= 0;
3531
            }
3532
        }
3533
    }else{
3534
        for(y=0; y<h; y++){
3535
            for(x=0; x<w; x++){
3536
                int i= src[x + y*stride];
3537

    
3538
                if((unsigned)(i+thres1) > thres2){
3539
                    if(i>=0){
3540
                        i<<= QEXPSHIFT;
3541
                        i= (i + bias) / qmul; //FIXME optimize
3542
                        src[x + y*stride]=  i;
3543
                    }else{
3544
                        i= -i;
3545
                        i<<= QEXPSHIFT;
3546
                        i= (i + bias) / qmul; //FIXME optimize
3547
                        src[x + y*stride]= -i;
3548
                    }
3549
                }else
3550
                    src[x + y*stride]= 0;
3551
            }
3552
        }
3553
    }
3554
    if(level+1 == s->spatial_decomposition_count){
3555
//        STOP_TIMER("quantize")
3556
    }
3557
}
3558

    
3559
static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3560
    const int w= b->width;
3561
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3562
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3563
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3564
    int x,y;
3565
    START_TIMER
3566

    
3567
    if(s->qlog == LOSSLESS_QLOG) return;
3568

    
3569
    for(y=start_y; y<end_y; y++){
3570
//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3571
        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3572
        for(x=0; x<w; x++){
3573
            int i= line[x];
3574
            if(i<0){
3575
                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3576
            }else if(i>0){
3577
                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3578
            }
3579
        }
3580
    }
3581
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3582
        STOP_TIMER("dquant")
3583
    }
3584
}
3585

    
3586
static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3587
    const int w= b->width;
3588
    const int h= b->height;
3589
    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3590
    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3591
    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3592
    int x,y;
3593
    START_TIMER
3594

    
3595
    if(s->qlog == LOSSLESS_QLOG) return;
3596

    
3597
    for(y=0; y<h; y++){
3598
        for(x=0; x<w; x++){
3599
            int i= src[x + y*stride];
3600
            if(i<0){
3601
                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3602
            }else if(i>0){
3603
                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3604
            }
3605
        }
3606
    }
3607
    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3608
        STOP_TIMER("dquant")
3609
    }
3610
}
3611

    
3612
static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3613
    const int w= b->width;
3614
    const int h= b->height;
3615
    int x,y;
3616

    
3617
    for(y=h-1; y>=0; y--){
3618
        for(x=w-1; x>=0; x--){
3619
            int i= x + y*stride;
3620

    
3621
            if(x){
3622
                if(use_median){
3623
                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3624
                    else  src[i] -= src[i - 1];
3625
                }else{
3626
                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3627
                    else  src[i] -= src[i - 1];
3628
                }
3629
            }else{
3630
                if(y) src[i] -= src[i - stride];
3631
            }
3632
        }
3633
    }
3634
}
3635

    
3636
static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3637
    const int w= b->width;
3638
    int x,y;
3639

    
3640
//    START_TIMER
3641

    
3642
    DWTELEM * line;
3643
    DWTELEM * prev;
3644

    
3645
    if (start_y != 0)
3646
        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3647

    
3648
    for(y=start_y; y<end_y; y++){
3649
        prev = line;
3650
//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3651
        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3652
        for(x=0; x<w; x++){
3653
            if(x){
3654
                if(use_median){
3655
                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3656
                    else  line[x] += line[x - 1];
3657
                }else{
3658
                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3659
                    else  line[x] += line[x - 1];
3660
                }
3661
            }else{
3662
                if(y) line[x] += prev[x];
3663
            }
3664
        }
3665
    }
3666

    
3667
//    STOP_TIMER("correlate")
3668
}
3669

    
3670
static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3671
    const int w= b->width;
3672
    const int h= b->height;
3673
    int x,y;
3674

    
3675
    for(y=0; y<h; y++){
3676
        for(x=0; x<w; x++){
3677
            int i= x + y*stride;
3678

    
3679
            if(x){
3680
                if(use_median){
3681
                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3682
                    else  src[i] += src[i - 1];
3683
                }else{
3684
                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3685
                    else  src[i] += src[i - 1];
3686
                }
3687
            }else{
3688
                if(y) src[i] += src[i - stride];
3689
            }
3690
        }
3691
    }
3692
}
3693

    
3694
static void encode_header(SnowContext *s){
3695
    int plane_index, level, orientation;
3696
    uint8_t kstate[32];
3697

    
3698
    memset(kstate, MID_STATE, sizeof(kstate));
3699

    
3700
    put_rac(&s->c, kstate, s->keyframe);
3701
    if(s->keyframe || s->always_reset)
3702
        reset_contexts(s);
3703
    if(s->keyframe){
3704
        put_symbol(&s->c, s->header_state, s->version, 0);
3705
        put_rac(&s->c, s->header_state, s->always_reset);
3706
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3707
        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3708
        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3709
        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3710
        put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3711
        put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3712
        put_rac(&s->c, s->header_state, s->spatial_scalability);
3713
//        put_rac(&s->c, s->header_state, s->rate_scalability);
3714

    
3715
        for(plane_index=0; plane_index<2; plane_index++){
3716
            for(level=0; level<s->spatial_decomposition_count; level++){
3717
                for(orientation=level ? 1:0; orientation<4; orientation++){
3718
                    if(orientation==2) continue;
3719
                    put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3720
                }
3721
            }
3722
        }
3723
    }
3724
    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type, 0);
3725
    put_symbol(&s->c, s->header_state, s->qlog, 1);
3726
    put_symbol(&s->c, s->header_state, s->mv_scale, 0);
3727
    put_symbol(&s->c, s->header_state, s->qbias, 1);
3728
    put_symbol(&s->c, s->header_state, s->block_max_depth, 0);
3729
}
3730

    
3731
static int decode_header(SnowContext *s){
3732
    int plane_index, level, orientation;
3733
    uint8_t kstate[32];
3734

    
3735
    memset(kstate, MID_STATE, sizeof(kstate));
3736

    
3737
    s->keyframe= get_rac(&s->c, kstate);
3738
    if(s->keyframe || s->always_reset)
3739
        reset_contexts(s);
3740
    if(s->keyframe){
3741
        s->version= get_symbol(&s->c, s->header_state, 0);
3742
        if(s->version>0){
3743
            av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3744
            return -1;
3745
        }
3746
        s->always_reset= get_rac(&s->c, s->header_state);
3747
        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3748
        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3749
        s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3750
        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3751
        s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3752
        s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3753
        s->spatial_scalability= get_rac(&s->c, s->header_state);
3754
//        s->rate_scalability= get_rac(&s->c, s->header_state);
3755

    
3756
        for(plane_index=0; plane_index<3; plane_index++){
3757
            for(level=0; level<s->spatial_decomposition_count; level++){
3758
                for(orientation=level ? 1:0; orientation<4; orientation++){
3759
                    int q;
3760
                    if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3761
                    else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3762
                    else                    q= get_symbol(&s->c, s->header_state, 1);
3763
                    s->plane[plane_index].band[level][orientation].qlog= q;
3764
                }
3765
            }
3766
        }
3767
    }
3768

    
3769
    s->spatial_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3770
    if(s->spatial_decomposition_type > 2){
3771
        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3772
        return -1;
3773
    }
3774

    
3775
    s->qlog= get_symbol(&s->c, s->header_state, 1);
3776
    s->mv_scale= get_symbol(&s->c, s->header_state, 0);
3777
    s->qbias= get_symbol(&s->c, s->header_state, 1);
3778
    s->block_max_depth= get_symbol(&s->c, s->header_state, 0);
3779
    if(s->block_max_depth > 1){
3780
        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3781
        s->block_max_depth= 0;
3782
        return -1;
3783
    }
3784

    
3785
    return 0;
3786
}
3787

    
3788
static void init_qexp(){
3789
    int i;
3790
    double v=128;
3791

    
3792
    for(i=0; i<QROOT; i++){
3793
        qexp[i]= lrintf(v);
3794
        v *= pow(2, 1.0 / QROOT);
3795
    }
3796
}
3797

    
3798
static int common_init(AVCodecContext *avctx){
3799
    SnowContext *s = avctx->priv_data;
3800
    int width, height;
3801
    int level, orientation, plane_index, dec;
3802

    
3803
    s->avctx= avctx;
3804

    
3805
    dsputil_init(&s->dsp, avctx);
3806

    
3807
#define mcf(dx,dy)\
3808
    s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3809
    s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3810
        s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3811
    s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3812
    s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3813
        s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3814

    
3815
    mcf( 0, 0)
3816
    mcf( 4, 0)
3817
    mcf( 8, 0)
3818
    mcf(12, 0)
3819
    mcf( 0, 4)
3820
    mcf( 4, 4)
3821
    mcf( 8, 4)
3822
    mcf(12, 4)
3823
    mcf( 0, 8)
3824
    mcf( 4, 8)
3825
    mcf( 8, 8)
3826
    mcf(12, 8)
3827
    mcf( 0,12)
3828
    mcf( 4,12)
3829
    mcf( 8,12)
3830
    mcf(12,12)
3831

    
3832
#define mcfh(dx,dy)\
3833
    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3834
    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3835
        mc_block_hpel ## dx ## dy ## 16;\
3836
    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3837
    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3838
        mc_block_hpel ## dx ## dy ## 8;
3839

    
3840
    mcfh(0, 0)
3841
    mcfh(8, 0)
3842
    mcfh(0, 8)
3843
    mcfh(8, 8)
3844

    
3845
    if(!qexp[0])
3846
        init_qexp();
3847

    
3848
    dec= s->spatial_decomposition_count= 5;
3849
    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3850

    
3851
    s->chroma_h_shift= 1; //FIXME XXX
3852
    s->chroma_v_shift= 1;
3853

    
3854
//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3855

    
3856
    width= s->avctx->width;
3857
    height= s->avctx->height;
3858

    
3859
    s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3860

    
3861
    s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3862
    s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3863

    
3864
    for(plane_index=0; plane_index<3; plane_index++){
3865
        int w= s->avctx->width;
3866
        int h= s->avctx->height;
3867

    
3868
        if(plane_index){
3869
            w>>= s->chroma_h_shift;
3870
            h>>= s->chroma_v_shift;
3871
        }
3872
        s->plane[plane_index].width = w;
3873
        s->plane[plane_index].height= h;
3874
//av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3875
        for(level=s->spatial_decomposition_count-1; level>=0; level--){
3876
            for(orientation=level ? 1 : 0; orientation<4; orientation++){
3877
                SubBand *b= &s->plane[plane_index].band[level][orientation];
3878

    
3879
                b->buf= s->spatial_dwt_buffer;
3880
                b->level= level;
3881
                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3882
                b->width = (w + !(orientation&1))>>1;
3883
                b->height= (h + !(orientation>1))>>1;
3884

    
3885
                b->stride_line = 1 << (s->spatial_decomposition_count - level);
3886
                b->buf_x_offset = 0;
3887
                b->buf_y_offset = 0;
3888

    
3889
                if(orientation&1){
3890
                    b->buf += (w+1)>>1;
3891
                    b->buf_x_offset = (w+1)>>1;
3892
                }
3893
                if(orientation>1){
3894
                    b->buf += b->stride>>1;
3895
                    b->buf_y_offset = b->stride_line >> 1;
3896
                }
3897

    
3898
                if(level)
3899
                    b->parent= &s->plane[plane_index].band[level-1][orientation];
3900
                b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3901
            }
3902
            w= (w+1)>>1;
3903
            h= (h+1)>>1;
3904
        }
3905
    }
3906

    
3907
    reset_contexts(s);
3908
/*
3909
    width= s->width= avctx->width;
3910
    height= s->height= avctx->height;
3911

3912
    assert(width && height);
3913
*/
3914
    s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3915

    
3916
    return 0;
3917
}
3918

    
3919

    
3920
static void calculate_vissual_weight(SnowContext *s, Plane *p){
3921
    int width = p->width;
3922
    int height= p->height;
3923
    int level, orientation, x, y;
3924

    
3925
    for(level=0; level<s->spatial_decomposition_count; level++){
3926
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
3927
            SubBand *b= &p->band[level][orientation];
3928
            DWTELEM *buf= b->buf;
3929
            int64_t error=0;
3930

    
3931
            memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3932
            buf[b->width/2 + b->height/2*b->stride]= 256*256;
3933
            ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3934
            for(y=0; y<height; y++){
3935
                for(x=0; x<width; x++){
3936
                    int64_t d= s->spatial_dwt_buffer[x + y*width];
3937
                    error += d*d;
3938
                }
3939
            }
3940

    
3941
            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3942
//            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3943
        }
3944
    }
3945
}
3946

    
3947
static int encode_init(AVCodecContext *avctx)
3948
{
3949
    SnowContext *s = avctx->priv_data;
3950
    int plane_index;
3951

    
3952
    if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3953
        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3954
               "use vstrict=-2 / -strict -2 to use it anyway\n");
3955
        return -1;
3956
    }
3957

    
3958
    common_init(avctx);
3959
    alloc_blocks(s);
3960

    
3961
    s->version=0;
3962

    
3963
    s->m.avctx   = avctx;
3964
    s->m.flags   = avctx->flags;
3965
    s->m.bit_rate= avctx->bit_rate;
3966

    
3967
    s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3968
    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3969
    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3970
    s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3971
    h263_encode_init(&s->m); //mv_penalty
3972

    
3973
    if(avctx->flags&CODEC_FLAG_PASS1){
3974
        if(!avctx->stats_out)
3975
            avctx->stats_out = av_mallocz(256);
3976
    }
3977
    if(avctx->flags&CODEC_FLAG_PASS2){
3978
        if(ff_rate_control_init(&s->m) < 0)
3979
            return -1;
3980
    }
3981

    
3982
    for(plane_index=0; plane_index<3; plane_index++){
3983
        calculate_vissual_weight(s, &s->plane[plane_index]);
3984
    }
3985

    
3986

    
3987
    avctx->coded_frame= &s->current_picture;
3988
    switch(avctx->pix_fmt){
3989
//    case PIX_FMT_YUV444P:
3990
//    case PIX_FMT_YUV422P:
3991
    case PIX_FMT_YUV420P:
3992
    case PIX_FMT_GRAY8:
3993
//    case PIX_FMT_YUV411P:
3994
//    case PIX_FMT_YUV410P:
3995
        s->colorspace_type= 0;
3996
        break;
3997
/*    case PIX_FMT_RGBA32:
3998
        s->colorspace= 1;
3999
        break;*/
4000
    default:
4001
        av_log(avctx, AV_LOG_ERROR, "format not supported\n");
4002
        return -1;
4003
    }
4004
//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
4005
    s->chroma_h_shift= 1;
4006
    s->chroma_v_shift= 1;
4007

    
4008
    ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4009
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4010

    
4011
    s->avctx->get_buffer(s->avctx, &s->input_picture);
4012

    
4013
    return 0;
4014
}
4015

    
4016
static int frame_start(SnowContext *s){
4017
   AVFrame tmp;
4018
   int w= s->avctx->width; //FIXME round up to x16 ?
4019
   int h= s->avctx->height;
4020

    
4021
    if(s->current_picture.data[0]){
4022
        draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4023
        draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4024
        draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4025
    }
4026

    
4027
    tmp= s->last_picture;
4028
    s->last_picture= s->current_picture;
4029
    s->current_picture= tmp;
4030

    
4031
    s->current_picture.reference= 1;
4032
    if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
4033
        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
4034
        return -1;
4035
    }
4036

    
4037
    return 0;
4038
}
4039

    
4040
static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
4041
    SnowContext *s = avctx->priv_data;
4042
    RangeCoder * const c= &s->c;
4043
    AVFrame *pict = data;
4044
    const int width= s->avctx->width;
4045
    const int height= s->avctx->height;
4046
    int level, orientation, plane_index, i, y;
4047

    
4048
    ff_init_range_encoder(c, buf, buf_size);
4049
    ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4050

    
4051
    for(i=0; i<3; i++){
4052
        int shift= !!i;
4053
        for(y=0; y<(height>>shift); y++)
4054
            memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
4055
                   &pict->data[i][y * pict->linesize[i]],
4056
                   width>>shift);
4057
    }
4058
    s->new_picture = *pict;
4059

    
4060
    if(avctx->flags&CODEC_FLAG_PASS2){
4061
        s->m.pict_type =
4062
        pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
4063
        s->keyframe= pict->pict_type==FF_I_TYPE;
4064
        s->m.picture_number= avctx->frame_number;
4065
        pict->quality= ff_rate_estimate_qscale(&s->m, 0);
4066
    }else{
4067
        s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
4068
        pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
4069
    }
4070

    
4071
    if(pict->quality){
4072
        s->qlog= rint(QROOT*log(pict->quality / (float)FF_QP2LAMBDA)/lo