Statistics
| Branch: | Revision:

ffmpeg / libavcodec / armv4l / dsputil_iwmmxt_rnd.h @ 6ad1fa5a

History | View | Annotate | Download (46.3 KB)

1
void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
2
{
3
    int stride = line_size;
4
    __asm__ __volatile__ (
5
        "and r12, %[pixels], #7 \n\t"
6
        "bic %[pixels], %[pixels], #7 \n\t"
7
        "tmcr wcgr1, r12 \n\t"
8
        "add r4, %[pixels], %[line_size] \n\t"
9
        "add r5, %[block], %[line_size] \n\t"
10
        "mov %[line_size], %[line_size], lsl #1 \n\t"
11
        "1: \n\t"
12
        "wldrd wr0, [%[pixels]] \n\t"
13
        "subs %[h], %[h], #2 \n\t"
14
        "wldrd wr1, [%[pixels], #8] \n\t"
15
        "add %[pixels], %[pixels], %[line_size] \n\t"
16
        "wldrd wr3, [r4] \n\t"
17
        "pld [%[pixels]] \n\t"
18
        "pld [%[pixels], #32] \n\t"
19
        "wldrd wr4, [r4, #8] \n\t"
20
        "add r4, r4, %[line_size] \n\t"
21
        "walignr1 wr8, wr0, wr1 \n\t"
22
        "pld [r4] \n\t"
23
        "pld [r4, #32] \n\t"
24
        "walignr1 wr10, wr3, wr4 \n\t"
25
        "wstrd wr8, [%[block]] \n\t"
26
        "add %[block], %[block], %[line_size] \n\t"
27
        "wstrd wr10, [r5] \n\t"
28
        "add r5, r5, %[line_size] \n\t"
29
        "bne 1b \n\t"
30
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
31
        :
32
        : "memory", "r4", "r5", "r12");
33
}
34

    
35
void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
36
{
37
    int stride = line_size;
38
    __asm__ __volatile__ (
39
        "and r12, %[pixels], #7 \n\t"
40
        "bic %[pixels], %[pixels], #7 \n\t"
41
        "tmcr wcgr1, r12 \n\t"
42
        "add r4, %[pixels], %[line_size] \n\t"
43
        "add r5, %[block], %[line_size] \n\t"
44
        "mov %[line_size], %[line_size], lsl #1 \n\t"
45
        "1: \n\t"
46
        "wldrd wr0, [%[pixels]] \n\t"
47
        "subs %[h], %[h], #2 \n\t"
48
        "wldrd wr1, [%[pixels], #8] \n\t"
49
        "add %[pixels], %[pixels], %[line_size] \n\t"
50
        "wldrd wr3, [r4] \n\t"
51
        "pld [%[pixels]] \n\t"
52
        "pld [%[pixels], #32] \n\t"
53
        "wldrd wr4, [r4, #8] \n\t"
54
        "add r4, r4, %[line_size] \n\t"
55
        "walignr1 wr8, wr0, wr1 \n\t"
56
        "wldrd wr0, [%[block]] \n\t"
57
        "wldrd wr2, [r5] \n\t"
58
        "pld [r4] \n\t"
59
        "pld [r4, #32] \n\t"
60
        "walignr1 wr10, wr3, wr4 \n\t"
61
        WAVG2B" wr8, wr8, wr0 \n\t"
62
        WAVG2B" wr10, wr10, wr2 \n\t"
63
        "wstrd wr8, [%[block]] \n\t"
64
        "add %[block], %[block], %[line_size] \n\t"
65
        "wstrd wr10, [r5] \n\t"
66
        "pld [%[block]] \n\t"
67
        "pld [%[block], #32] \n\t"
68
        "add r5, r5, %[line_size] \n\t"
69
        "pld [r5] \n\t"
70
        "pld [r5, #32] \n\t"
71
        "bne 1b \n\t"
72
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
73
        :
74
        : "memory", "r4", "r5", "r12");
75
}
76

    
77
void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
78
{
79
    int stride = line_size;
80
    __asm__ __volatile__ (
81
        "and r12, %[pixels], #7 \n\t"
82
        "bic %[pixels], %[pixels], #7 \n\t"
83
        "tmcr wcgr1, r12 \n\t"
84
        "add r4, %[pixels], %[line_size] \n\t"
85
        "add r5, %[block], %[line_size] \n\t"
86
        "mov %[line_size], %[line_size], lsl #1 \n\t"
87
        "1: \n\t"
88
        "wldrd wr0, [%[pixels]] \n\t"
89
        "wldrd wr1, [%[pixels], #8] \n\t"
90
        "subs %[h], %[h], #2 \n\t"
91
        "wldrd wr2, [%[pixels], #16] \n\t"
92
        "add %[pixels], %[pixels], %[line_size] \n\t"
93
        "wldrd wr3, [r4] \n\t"
94
        "pld [%[pixels]] \n\t"
95
        "pld [%[pixels], #32] \n\t"
96
        "walignr1 wr8, wr0, wr1 \n\t"
97
        "wldrd wr4, [r4, #8] \n\t"
98
        "walignr1 wr9, wr1, wr2 \n\t"
99
        "wldrd wr5, [r4, #16] \n\t"
100
        "add r4, r4, %[line_size] \n\t"
101
        "pld [r4] \n\t"
102
        "pld [r4, #32] \n\t"
103
        "walignr1 wr10, wr3, wr4 \n\t"
104
        "wstrd wr8, [%[block]] \n\t"
105
        "walignr1 wr11, wr4, wr5 \n\t"
106
        "wstrd wr9, [%[block], #8] \n\t"
107
        "add %[block], %[block], %[line_size] \n\t"
108
        "wstrd wr10, [r5] \n\t"
109
        "wstrd wr11, [r5, #8] \n\t"
110
        "add r5, r5, %[line_size] \n\t"
111
        "bne 1b \n\t"
112
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
113
        :
114
        : "memory", "r4", "r5", "r12");
115
}
116

    
117
void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
118
{
119
    int stride = line_size;
120
    __asm__ __volatile__ (
121
        "pld [%[pixels]]                \n\t"
122
        "pld [%[pixels], #32]           \n\t"
123
        "pld [%[block]]                 \n\t"
124
        "pld [%[block], #32]            \n\t"
125
        "and r12, %[pixels], #7         \n\t"
126
        "bic %[pixels], %[pixels], #7   \n\t"
127
        "tmcr wcgr1, r12                \n\t"
128
        "add r4, %[pixels], %[line_size]\n\t"
129
        "add r5, %[block], %[line_size] \n\t"
130
        "mov %[line_size], %[line_size], lsl #1 \n\t"
131
        "1:                             \n\t"
132
        "wldrd wr0, [%[pixels]]         \n\t"
133
        "wldrd wr1, [%[pixels], #8]     \n\t"
134
        "subs %[h], %[h], #2            \n\t"
135
        "wldrd wr2, [%[pixels], #16]    \n\t"
136
        "add %[pixels], %[pixels], %[line_size] \n\t"
137
        "wldrd wr3, [r4]                \n\t"
138
        "pld [%[pixels]]                \n\t"
139
        "pld [%[pixels], #32]           \n\t"
140
        "walignr1 wr8, wr0, wr1         \n\t"
141
        "wldrd wr4, [r4, #8]            \n\t"
142
        "walignr1 wr9, wr1, wr2         \n\t"
143
        "wldrd wr5, [r4, #16]           \n\t"
144
        "add r4, r4, %[line_size]       \n\t"
145
        "wldrd wr0, [%[block]]          \n\t"
146
        "pld [r4]                       \n\t"
147
        "wldrd wr1, [%[block], #8]      \n\t"
148
        "pld [r4, #32]                  \n\t"
149
        "wldrd wr2, [r5]                \n\t"
150
        "walignr1 wr10, wr3, wr4        \n\t"
151
        "wldrd wr3, [r5, #8]            \n\t"
152
        WAVG2B" wr8, wr8, wr0           \n\t"
153
        WAVG2B" wr9, wr9, wr1           \n\t"
154
        WAVG2B" wr10, wr10, wr2         \n\t"
155
        "wstrd wr8, [%[block]]          \n\t"
156
        "walignr1 wr11, wr4, wr5        \n\t"
157
        WAVG2B" wr11, wr11, wr3         \n\t"
158
        "wstrd wr9, [%[block], #8]      \n\t"
159
        "add %[block], %[block], %[line_size] \n\t"
160
        "wstrd wr10, [r5]               \n\t"
161
        "pld [%[block]]                 \n\t"
162
        "pld [%[block], #32]            \n\t"
163
        "wstrd wr11, [r5, #8]           \n\t"
164
        "add r5, r5, %[line_size]       \n\t"
165
        "pld [r5]                       \n\t"
166
        "pld [r5, #32]                  \n\t"
167
        "bne 1b \n\t"
168
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
169
        :
170
        : "memory", "r4", "r5", "r12");
171
}
172

    
173
void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
174
{
175
    int stride = line_size;
176
    // [wr0 wr1 wr2 wr3] for previous line
177
    // [wr4 wr5 wr6 wr7] for current line
178
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
179
    __asm__ __volatile__(
180
        "pld [%[pixels]]                \n\t"
181
        "pld [%[pixels], #32]           \n\t"
182
        "and r12, %[pixels], #7         \n\t"
183
        "bic %[pixels], %[pixels], #7   \n\t"
184
        "tmcr wcgr1, r12                \n\t"
185
        "add r12, r12, #1               \n\t"
186
        "add r4, %[pixels], %[line_size]\n\t"
187
        "tmcr wcgr2, r12                \n\t"
188
        "add r5, %[block], %[line_size] \n\t"
189
        "mov %[line_size], %[line_size], lsl #1 \n\t"
190

    
191
        "1:                             \n\t"
192
        "wldrd wr10, [%[pixels]]        \n\t"
193
        "cmp r12, #8                    \n\t"
194
        "wldrd wr11, [%[pixels], #8]    \n\t"
195
        "add %[pixels], %[pixels], %[line_size] \n\t"
196
        "wldrd wr13, [r4]               \n\t"
197
        "pld [%[pixels]]                \n\t"
198
        "wldrd wr14, [r4, #8]           \n\t"
199
        "pld [%[pixels], #32]           \n\t"
200
        "add r4, r4, %[line_size]       \n\t"
201
        "walignr1 wr0, wr10, wr11       \n\t"
202
        "pld [r4]                       \n\t"
203
        "pld [r4, #32]                  \n\t"
204
        "walignr1 wr2, wr13, wr14       \n\t"
205
        "wmoveq wr4, wr11               \n\t"
206
        "wmoveq wr6, wr14               \n\t"
207
        "walignr2ne wr4, wr10, wr11     \n\t"
208
        "walignr2ne wr6, wr13, wr14     \n\t"
209
        WAVG2B" wr0, wr0, wr4           \n\t"
210
        WAVG2B" wr2, wr2, wr6           \n\t"
211
        "wstrd wr0, [%[block]]          \n\t"
212
        "subs %[h], %[h], #2            \n\t"
213
        "wstrd wr2, [r5]                \n\t"
214
        "add %[block], %[block], %[line_size]   \n\t"
215
        "add r5, r5, %[line_size]       \n\t"
216
        "bne 1b                         \n\t"
217
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
218
        :
219
        : "r4", "r5", "r12", "memory");
220
}
221

    
222
void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
223
{
224
    int stride = line_size;
225
    // [wr0 wr1 wr2 wr3] for previous line
226
    // [wr4 wr5 wr6 wr7] for current line
227
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
228
    __asm__ __volatile__(
229
        "pld [%[pixels]]                \n\t"
230
        "pld [%[pixels], #32]           \n\t"
231
        "and r12, %[pixels], #7         \n\t"
232
        "bic %[pixels], %[pixels], #7   \n\t"
233
        "tmcr wcgr1, r12                \n\t"
234
        "add r12, r12, #1               \n\t"
235
        "add r4, %[pixels], %[line_size]\n\t"
236
        "tmcr wcgr2, r12                \n\t"
237
        "add r5, %[block], %[line_size] \n\t"
238
        "mov %[line_size], %[line_size], lsl #1 \n\t"
239

    
240
        "1:                             \n\t"
241
        "wldrd wr10, [%[pixels]]        \n\t"
242
        "cmp r12, #8                    \n\t"
243
        "wldrd wr11, [%[pixels], #8]    \n\t"
244
        "wldrd wr12, [%[pixels], #16]   \n\t"
245
        "add %[pixels], %[pixels], %[line_size] \n\t"
246
        "wldrd wr13, [r4]               \n\t"
247
        "pld [%[pixels]]                \n\t"
248
        "wldrd wr14, [r4, #8]           \n\t"
249
        "pld [%[pixels], #32]           \n\t"
250
        "wldrd wr15, [r4, #16]          \n\t"
251
        "add r4, r4, %[line_size]       \n\t"
252
        "walignr1 wr0, wr10, wr11       \n\t"
253
        "pld [r4]                       \n\t"
254
        "pld [r4, #32]                  \n\t"
255
        "walignr1 wr1, wr11, wr12       \n\t"
256
        "walignr1 wr2, wr13, wr14       \n\t"
257
        "walignr1 wr3, wr14, wr15       \n\t"
258
        "wmoveq wr4, wr11               \n\t"
259
        "wmoveq wr5, wr12               \n\t"
260
        "wmoveq wr6, wr14               \n\t"
261
        "wmoveq wr7, wr15               \n\t"
262
        "walignr2ne wr4, wr10, wr11     \n\t"
263
        "walignr2ne wr5, wr11, wr12     \n\t"
264
        "walignr2ne wr6, wr13, wr14     \n\t"
265
        "walignr2ne wr7, wr14, wr15     \n\t"
266
        WAVG2B" wr0, wr0, wr4           \n\t"
267
        WAVG2B" wr1, wr1, wr5           \n\t"
268
        "wstrd wr0, [%[block]]          \n\t"
269
        WAVG2B" wr2, wr2, wr6           \n\t"
270
        "wstrd wr1, [%[block], #8]      \n\t"
271
        WAVG2B" wr3, wr3, wr7           \n\t"
272
        "add %[block], %[block], %[line_size]   \n\t"
273
        "wstrd wr2, [r5]                \n\t"
274
        "subs %[h], %[h], #2            \n\t"
275
        "wstrd wr3, [r5, #8]            \n\t"
276
        "add r5, r5, %[line_size]       \n\t"
277
        "bne 1b                         \n\t"
278
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
279
        :
280
        : "r4", "r5", "r12", "memory");
281
}
282

    
283
void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
284
{
285
    int stride = line_size;
286
    // [wr0 wr1 wr2 wr3] for previous line
287
    // [wr4 wr5 wr6 wr7] for current line
288
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
289
    __asm__ __volatile__(
290
        "pld [%[pixels]]                \n\t"
291
        "pld [%[pixels], #32]           \n\t"
292
        "pld [%[block]]                 \n\t"
293
        "pld [%[block], #32]            \n\t"
294
        "and r12, %[pixels], #7         \n\t"
295
        "bic %[pixels], %[pixels], #7   \n\t"
296
        "tmcr wcgr1, r12                \n\t"
297
        "add r12, r12, #1               \n\t"
298
        "add r4, %[pixels], %[line_size]\n\t"
299
        "tmcr wcgr2, r12                \n\t"
300
        "add r5, %[block], %[line_size] \n\t"
301
        "mov %[line_size], %[line_size], lsl #1 \n\t"
302
        "pld [r5]                       \n\t"
303
        "pld [r5, #32]                  \n\t"
304

    
305
        "1:                             \n\t"
306
        "wldrd wr10, [%[pixels]]        \n\t"
307
        "cmp r12, #8                    \n\t"
308
        "wldrd wr11, [%[pixels], #8]    \n\t"
309
        "add %[pixels], %[pixels], %[line_size] \n\t"
310
        "wldrd wr13, [r4]               \n\t"
311
        "pld [%[pixels]]                \n\t"
312
        "wldrd wr14, [r4, #8]           \n\t"
313
        "pld [%[pixels], #32]           \n\t"
314
        "add r4, r4, %[line_size]       \n\t"
315
        "walignr1 wr0, wr10, wr11       \n\t"
316
        "pld [r4]                       \n\t"
317
        "pld [r4, #32]                  \n\t"
318
        "walignr1 wr2, wr13, wr14       \n\t"
319
        "wmoveq wr4, wr11               \n\t"
320
        "wmoveq wr6, wr14               \n\t"
321
        "walignr2ne wr4, wr10, wr11     \n\t"
322
        "wldrd wr10, [%[block]]         \n\t"
323
        "walignr2ne wr6, wr13, wr14     \n\t"
324
        "wldrd wr12, [r5]               \n\t"
325
        WAVG2B" wr0, wr0, wr4           \n\t"
326
        WAVG2B" wr2, wr2, wr6           \n\t"
327
        WAVG2B" wr0, wr0, wr10          \n\t"
328
        WAVG2B" wr2, wr2, wr12          \n\t"
329
        "wstrd wr0, [%[block]]          \n\t"
330
        "subs %[h], %[h], #2            \n\t"
331
        "wstrd wr2, [r5]                \n\t"
332
        "add %[block], %[block], %[line_size]   \n\t"
333
        "add r5, r5, %[line_size]       \n\t"
334
        "pld [%[block]]                 \n\t"
335
        "pld [%[block], #32]            \n\t"
336
        "pld [r5]                       \n\t"
337
        "pld [r5, #32]                  \n\t"
338
        "bne 1b                         \n\t"
339
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
340
        :
341
        : "r4", "r5", "r12", "memory");
342
}
343

    
344
void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
345
{
346
    int stride = line_size;
347
    // [wr0 wr1 wr2 wr3] for previous line
348
    // [wr4 wr5 wr6 wr7] for current line
349
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
350
    __asm__ __volatile__(
351
        "pld [%[pixels]]                \n\t"
352
        "pld [%[pixels], #32]           \n\t"
353
        "pld [%[block]]                 \n\t"
354
        "pld [%[block], #32]            \n\t"
355
        "and r12, %[pixels], #7         \n\t"
356
        "bic %[pixels], %[pixels], #7   \n\t"
357
        "tmcr wcgr1, r12                \n\t"
358
        "add r12, r12, #1               \n\t"
359
        "add r4, %[pixels], %[line_size]\n\t"
360
        "tmcr wcgr2, r12                \n\t"
361
        "add r5, %[block], %[line_size] \n\t"
362
        "mov %[line_size], %[line_size], lsl #1 \n\t"
363
        "pld [r5]                       \n\t"
364
        "pld [r5, #32]                  \n\t"
365

    
366
        "1:                             \n\t"
367
        "wldrd wr10, [%[pixels]]        \n\t"
368
        "cmp r12, #8                    \n\t"
369
        "wldrd wr11, [%[pixels], #8]    \n\t"
370
        "wldrd wr12, [%[pixels], #16]   \n\t"
371
        "add %[pixels], %[pixels], %[line_size] \n\t"
372
        "wldrd wr13, [r4]               \n\t"
373
        "pld [%[pixels]]                \n\t"
374
        "wldrd wr14, [r4, #8]           \n\t"
375
        "pld [%[pixels], #32]           \n\t"
376
        "wldrd wr15, [r4, #16]          \n\t"
377
        "add r4, r4, %[line_size]       \n\t"
378
        "walignr1 wr0, wr10, wr11       \n\t"
379
        "pld [r4]                       \n\t"
380
        "pld [r4, #32]                  \n\t"
381
        "walignr1 wr1, wr11, wr12       \n\t"
382
        "walignr1 wr2, wr13, wr14       \n\t"
383
        "walignr1 wr3, wr14, wr15       \n\t"
384
        "wmoveq wr4, wr11               \n\t"
385
        "wmoveq wr5, wr12               \n\t"
386
        "wmoveq wr6, wr14               \n\t"
387
        "wmoveq wr7, wr15               \n\t"
388
        "walignr2ne wr4, wr10, wr11     \n\t"
389
        "walignr2ne wr5, wr11, wr12     \n\t"
390
        "walignr2ne wr6, wr13, wr14     \n\t"
391
        "walignr2ne wr7, wr14, wr15     \n\t"
392
        "wldrd wr10, [%[block]]         \n\t"
393
        WAVG2B" wr0, wr0, wr4           \n\t"
394
        "wldrd wr11, [%[block], #8]     \n\t"
395
        WAVG2B" wr1, wr1, wr5           \n\t"
396
        "wldrd wr12, [r5]               \n\t"
397
        WAVG2B" wr2, wr2, wr6           \n\t"
398
        "wldrd wr13, [r5, #8]           \n\t"
399
        WAVG2B" wr3, wr3, wr7           \n\t"
400
        WAVG2B" wr0, wr0, wr10          \n\t"
401
        WAVG2B" wr1, wr1, wr11          \n\t"
402
        WAVG2B" wr2, wr2, wr12          \n\t"
403
        WAVG2B" wr3, wr3, wr13          \n\t"
404
        "wstrd wr0, [%[block]]          \n\t"
405
        "subs %[h], %[h], #2            \n\t"
406
        "wstrd wr1, [%[block], #8]      \n\t"
407
        "add %[block], %[block], %[line_size]   \n\t"
408
        "wstrd wr2, [r5]                \n\t"
409
        "pld [%[block]]                 \n\t"
410
        "wstrd wr3, [r5, #8]            \n\t"
411
        "add r5, r5, %[line_size]       \n\t"
412
        "pld [%[block], #32]            \n\t"
413
        "pld [r5]                       \n\t"
414
        "pld [r5, #32]                  \n\t"
415
        "bne 1b                         \n\t"
416
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
417
        :
418
        :"r4", "r5", "r12", "memory");
419
}
420

    
421
void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
422
{
423
    int stride = line_size;
424
    // [wr0 wr1 wr2 wr3] for previous line
425
    // [wr4 wr5 wr6 wr7] for current line
426
    __asm__ __volatile__(
427
        "pld            [%[pixels]]                             \n\t"
428
        "pld            [%[pixels], #32]                        \n\t"
429
        "and            r12, %[pixels], #7                      \n\t"
430
        "tmcr           wcgr1, r12                              \n\t"
431
        "bic            %[pixels], %[pixels], #7                \n\t"
432

    
433
        "wldrd          wr10, [%[pixels]]                       \n\t"
434
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
435
        "pld            [%[block]]                              \n\t"
436
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
437
        "walignr1       wr0, wr10, wr11                         \n\t"
438
        "pld            [%[pixels]]                             \n\t"
439
        "pld            [%[pixels], #32]                        \n\t"
440

    
441
      "1:                                                       \n\t"
442
        "wldrd          wr10, [%[pixels]]                       \n\t"
443
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
444
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
445
        "pld            [%[pixels]]                             \n\t"
446
        "pld            [%[pixels], #32]                        \n\t"
447
        "walignr1       wr4, wr10, wr11                         \n\t"
448
        "wldrd          wr10, [%[block]]                        \n\t"
449
         WAVG2B"        wr8, wr0, wr4                           \n\t"
450
         WAVG2B"        wr8, wr8, wr10                          \n\t"
451
        "wstrd          wr8, [%[block]]                         \n\t"
452
        "add            %[block], %[block], %[line_size]        \n\t"
453

    
454
        "wldrd          wr10, [%[pixels]]                       \n\t"
455
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
456
        "pld            [%[block]]                              \n\t"
457
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
458
        "pld            [%[pixels]]                             \n\t"
459
        "pld            [%[pixels], #32]                        \n\t"
460
        "walignr1       wr0, wr10, wr11                         \n\t"
461
        "wldrd          wr10, [%[block]]                        \n\t"
462
         WAVG2B"        wr8, wr0, wr4                           \n\t"
463
         WAVG2B"        wr8, wr8, wr10                          \n\t"
464
        "wstrd          wr8, [%[block]]                         \n\t"
465
        "add            %[block], %[block], %[line_size]        \n\t"
466

    
467
        "subs           %[h], %[h], #2                          \n\t"
468
        "pld            [%[block]]                              \n\t"
469
        "bne            1b                                      \n\t"
470
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
471
        :
472
        : "cc", "memory", "r12");
473
}
474

    
475
void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
476
{
477
    int stride = line_size;
478
    // [wr0 wr1 wr2 wr3] for previous line
479
    // [wr4 wr5 wr6 wr7] for current line
480
    __asm__ __volatile__(
481
        "pld [%[pixels]]                \n\t"
482
        "pld [%[pixels], #32]           \n\t"
483
        "and r12, %[pixels], #7         \n\t"
484
        "tmcr wcgr1, r12                \n\t"
485
        "bic %[pixels], %[pixels], #7   \n\t"
486

    
487
        "wldrd wr10, [%[pixels]]        \n\t"
488
        "wldrd wr11, [%[pixels], #8]    \n\t"
489
        "wldrd wr12, [%[pixels], #16]   \n\t"
490
        "add %[pixels], %[pixels], %[line_size] \n\t"
491
        "pld [%[pixels]]                \n\t"
492
        "pld [%[pixels], #32]           \n\t"
493
        "walignr1 wr0, wr10, wr11       \n\t"
494
        "walignr1 wr1, wr11, wr12       \n\t"
495

    
496
        "1:                             \n\t"
497
        "wldrd wr10, [%[pixels]]        \n\t"
498
        "wldrd wr11, [%[pixels], #8]    \n\t"
499
        "wldrd wr12, [%[pixels], #16]   \n\t"
500
        "add %[pixels], %[pixels], %[line_size] \n\t"
501
        "pld [%[pixels]]                \n\t"
502
        "pld [%[pixels], #32]           \n\t"
503
        "walignr1 wr4, wr10, wr11       \n\t"
504
        "walignr1 wr5, wr11, wr12       \n\t"
505
        WAVG2B" wr8, wr0, wr4           \n\t"
506
        WAVG2B" wr9, wr1, wr5           \n\t"
507
        "wstrd wr8, [%[block]]          \n\t"
508
        "wstrd wr9, [%[block], #8]      \n\t"
509
        "add %[block], %[block], %[line_size]   \n\t"
510

    
511
        "wldrd wr10, [%[pixels]]        \n\t"
512
        "wldrd wr11, [%[pixels], #8]    \n\t"
513
        "wldrd wr12, [%[pixels], #16]   \n\t"
514
        "add %[pixels], %[pixels], %[line_size] \n\t"
515
        "pld [%[pixels]]                \n\t"
516
        "pld [%[pixels], #32]           \n\t"
517
        "walignr1 wr0, wr10, wr11       \n\t"
518
        "walignr1 wr1, wr11, wr12       \n\t"
519
        WAVG2B" wr8, wr0, wr4           \n\t"
520
        WAVG2B" wr9, wr1, wr5           \n\t"
521
        "wstrd wr8, [%[block]]          \n\t"
522
        "wstrd wr9, [%[block], #8]      \n\t"
523
        "add %[block], %[block], %[line_size]   \n\t"
524

    
525
        "subs %[h], %[h], #2            \n\t"
526
        "bne 1b                         \n\t"
527
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
528
        :
529
        : "r4", "r5", "r12", "memory");
530
}
531

    
532
void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
533
{
534
    int stride = line_size;
535
    // [wr0 wr1 wr2 wr3] for previous line
536
    // [wr4 wr5 wr6 wr7] for current line
537
    __asm__ __volatile__(
538
        "pld [%[pixels]]                \n\t"
539
        "pld [%[pixels], #32]           \n\t"
540
        "and r12, %[pixels], #7         \n\t"
541
        "tmcr wcgr1, r12                \n\t"
542
        "bic %[pixels], %[pixels], #7   \n\t"
543

    
544
        "wldrd wr10, [%[pixels]]        \n\t"
545
        "wldrd wr11, [%[pixels], #8]    \n\t"
546
        "pld [%[block]]                 \n\t"
547
        "wldrd wr12, [%[pixels], #16]   \n\t"
548
        "add %[pixels], %[pixels], %[line_size] \n\t"
549
        "pld [%[pixels]]                \n\t"
550
        "pld [%[pixels], #32]           \n\t"
551
        "walignr1 wr0, wr10, wr11       \n\t"
552
        "walignr1 wr1, wr11, wr12       \n\t"
553

    
554
        "1:                             \n\t"
555
        "wldrd wr10, [%[pixels]]        \n\t"
556
        "wldrd wr11, [%[pixels], #8]    \n\t"
557
        "wldrd wr12, [%[pixels], #16]   \n\t"
558
        "add %[pixels], %[pixels], %[line_size] \n\t"
559
        "pld [%[pixels]]                \n\t"
560
        "pld [%[pixels], #32]           \n\t"
561
        "walignr1 wr4, wr10, wr11       \n\t"
562
        "walignr1 wr5, wr11, wr12       \n\t"
563
        "wldrd wr10, [%[block]]         \n\t"
564
        "wldrd wr11, [%[block], #8]     \n\t"
565
        WAVG2B" wr8, wr0, wr4           \n\t"
566
        WAVG2B" wr9, wr1, wr5           \n\t"
567
        WAVG2B" wr8, wr8, wr10          \n\t"
568
        WAVG2B" wr9, wr9, wr11          \n\t"
569
        "wstrd wr8, [%[block]]          \n\t"
570
        "wstrd wr9, [%[block], #8]      \n\t"
571
        "add %[block], %[block], %[line_size]   \n\t"
572

    
573
        "wldrd wr10, [%[pixels]]        \n\t"
574
        "wldrd wr11, [%[pixels], #8]    \n\t"
575
        "pld [%[block]]                 \n\t"
576
        "wldrd wr12, [%[pixels], #16]   \n\t"
577
        "add %[pixels], %[pixels], %[line_size] \n\t"
578
        "pld [%[pixels]]                \n\t"
579
        "pld [%[pixels], #32]           \n\t"
580
        "walignr1 wr0, wr10, wr11       \n\t"
581
        "walignr1 wr1, wr11, wr12       \n\t"
582
        "wldrd wr10, [%[block]]         \n\t"
583
        "wldrd wr11, [%[block], #8]     \n\t"
584
        WAVG2B" wr8, wr0, wr4           \n\t"
585
        WAVG2B" wr9, wr1, wr5           \n\t"
586
        WAVG2B" wr8, wr8, wr10          \n\t"
587
        WAVG2B" wr9, wr9, wr11          \n\t"
588
        "wstrd wr8, [%[block]]          \n\t"
589
        "wstrd wr9, [%[block], #8]      \n\t"
590
        "add %[block], %[block], %[line_size]   \n\t"
591

    
592
        "subs %[h], %[h], #2            \n\t"
593
        "pld [%[block]]                 \n\t"
594
        "bne 1b                         \n\t"
595
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
596
        :
597
        : "r4", "r5", "r12", "memory");
598
}
599

    
600
void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
601
{
602
    // [wr0 wr1 wr2 wr3] for previous line
603
    // [wr4 wr5 wr6 wr7] for current line
604
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
605
    __asm__ __volatile__(
606
        "pld [%[pixels]]                \n\t"
607
        "mov r12, #2                    \n\t"
608
        "pld [%[pixels], #32]           \n\t"
609
        "tmcr wcgr0, r12                \n\t" /* for shift value */
610
        "and r12, %[pixels], #7         \n\t"
611
        "bic %[pixels], %[pixels], #7   \n\t"
612
        "tmcr wcgr1, r12                \n\t"
613

    
614
        // [wr0 wr1 wr2 wr3] <= *
615
        // [wr4 wr5 wr6 wr7]
616
        "wldrd wr12, [%[pixels]]        \n\t"
617
        "add r12, r12, #1               \n\t"
618
        "wldrd wr13, [%[pixels], #8]    \n\t"
619
        "tmcr wcgr2, r12                \n\t"
620
        "add %[pixels], %[pixels], %[line_size] \n\t"
621
        "cmp r12, #8                    \n\t"
622
        "pld [%[pixels]]                \n\t"
623
        "pld [%[pixels], #32]           \n\t"
624
        "walignr1 wr2, wr12, wr13       \n\t"
625
        "wmoveq wr10, wr13              \n\t"
626
        "walignr2ne wr10, wr12, wr13    \n\t"
627
        "wunpckelub wr0, wr2            \n\t"
628
        "wunpckehub wr1, wr2            \n\t"
629
        "wunpckelub wr8, wr10           \n\t"
630
        "wunpckehub wr9, wr10           \n\t"
631
        "waddhus wr0, wr0, wr8          \n\t"
632
        "waddhus wr1, wr1, wr9          \n\t"
633

    
634
        "1:                             \n\t"
635
        // [wr0 wr1 wr2 wr3]
636
        // [wr4 wr5 wr6 wr7] <= *
637
        "wldrd wr12, [%[pixels]]        \n\t"
638
        "cmp r12, #8                    \n\t"
639
        "wldrd wr13, [%[pixels], #8]    \n\t"
640
        "add %[pixels], %[pixels], %[line_size] \n\t"
641
        "walignr1 wr6, wr12, wr13       \n\t"
642
        "pld [%[pixels]]                \n\t"
643
        "pld [%[pixels], #32]           \n\t"
644
        "wmoveq wr10, wr13              \n\t"
645
        "walignr2ne wr10, wr12, wr13    \n\t"
646
        "wunpckelub wr4, wr6            \n\t"
647
        "wunpckehub wr5, wr6            \n\t"
648
        "wunpckelub wr8, wr10           \n\t"
649
        "wunpckehub wr9, wr10           \n\t"
650
        "waddhus wr4, wr4, wr8          \n\t"
651
        "waddhus wr5, wr5, wr9          \n\t"
652
        "waddhus wr8, wr0, wr4          \n\t"
653
        "waddhus wr9, wr1, wr5          \n\t"
654
        "waddhus wr8, wr8, wr15         \n\t"
655
        "waddhus wr9, wr9, wr15         \n\t"
656
        "wsrlhg wr8, wr8, wcgr0         \n\t"
657
        "wsrlhg wr9, wr9, wcgr0         \n\t"
658
        "wpackhus wr8, wr8, wr9         \n\t"
659
        "wstrd wr8, [%[block]]          \n\t"
660
        "add %[block], %[block], %[line_size]   \n\t"
661

    
662
        // [wr0 wr1 wr2 wr3] <= *
663
        // [wr4 wr5 wr6 wr7]
664
        "wldrd wr12, [%[pixels]]        \n\t"
665
        "wldrd wr13, [%[pixels], #8]    \n\t"
666
        "add %[pixels], %[pixels], %[line_size] \n\t"
667
        "walignr1 wr2, wr12, wr13       \n\t"
668
        "pld [%[pixels]]                \n\t"
669
        "pld [%[pixels], #32]           \n\t"
670
        "wmoveq wr10, wr13              \n\t"
671
        "walignr2ne wr10, wr12, wr13    \n\t"
672
        "wunpckelub wr0, wr2            \n\t"
673
        "wunpckehub wr1, wr2            \n\t"
674
        "wunpckelub wr8, wr10           \n\t"
675
        "wunpckehub wr9, wr10           \n\t"
676
        "waddhus wr0, wr0, wr8          \n\t"
677
        "waddhus wr1, wr1, wr9          \n\t"
678
        "waddhus wr8, wr0, wr4          \n\t"
679
        "waddhus wr9, wr1, wr5          \n\t"
680
        "waddhus wr8, wr8, wr15         \n\t"
681
        "waddhus wr9, wr9, wr15         \n\t"
682
        "wsrlhg wr8, wr8, wcgr0         \n\t"
683
        "wsrlhg wr9, wr9, wcgr0         \n\t"
684
        "wpackhus wr8, wr8, wr9         \n\t"
685
        "subs %[h], %[h], #2            \n\t"
686
        "wstrd wr8, [%[block]]          \n\t"
687
        "add %[block], %[block], %[line_size]   \n\t"
688
        "bne 1b                         \n\t"
689
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
690
        : [line_size]"r"(line_size)
691
        : "r12", "memory");
692
}
693

    
694
void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
695
{
696
    // [wr0 wr1 wr2 wr3] for previous line
697
    // [wr4 wr5 wr6 wr7] for current line
698
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
699
    __asm__ __volatile__(
700
        "pld [%[pixels]]                \n\t"
701
        "mov r12, #2                    \n\t"
702
        "pld [%[pixels], #32]           \n\t"
703
        "tmcr wcgr0, r12                \n\t" /* for shift value */
704
        /* alignment */
705
        "and r12, %[pixels], #7         \n\t"
706
        "bic %[pixels], %[pixels], #7   \n\t"
707
        "tmcr wcgr1, r12                \n\t"
708
        "add r12, r12, #1               \n\t"
709
        "tmcr wcgr2, r12                \n\t"
710

    
711
        // [wr0 wr1 wr2 wr3] <= *
712
        // [wr4 wr5 wr6 wr7]
713
        "wldrd wr12, [%[pixels]]        \n\t"
714
        "cmp r12, #8                    \n\t"
715
        "wldrd wr13, [%[pixels], #8]    \n\t"
716
        "wldrd wr14, [%[pixels], #16]   \n\t"
717
        "add %[pixels], %[pixels], %[line_size] \n\t"
718
        "pld [%[pixels]]                \n\t"
719
        "walignr1 wr2, wr12, wr13       \n\t"
720
        "pld [%[pixels], #32]           \n\t"
721
        "walignr1 wr3, wr13, wr14       \n\t"
722
        "wmoveq wr10, wr13              \n\t"
723
        "wmoveq wr11, wr14              \n\t"
724
        "walignr2ne wr10, wr12, wr13    \n\t"
725
        "walignr2ne wr11, wr13, wr14    \n\t"
726
        "wunpckelub wr0, wr2            \n\t"
727
        "wunpckehub wr1, wr2            \n\t"
728
        "wunpckelub wr2, wr3            \n\t"
729
        "wunpckehub wr3, wr3            \n\t"
730
        "wunpckelub wr8, wr10           \n\t"
731
        "wunpckehub wr9, wr10           \n\t"
732
        "wunpckelub wr10, wr11          \n\t"
733
        "wunpckehub wr11, wr11          \n\t"
734
        "waddhus wr0, wr0, wr8          \n\t"
735
        "waddhus wr1, wr1, wr9          \n\t"
736
        "waddhus wr2, wr2, wr10         \n\t"
737
        "waddhus wr3, wr3, wr11         \n\t"
738

    
739
        "1:                             \n\t"
740
        // [wr0 wr1 wr2 wr3]
741
        // [wr4 wr5 wr6 wr7] <= *
742
        "wldrd wr12, [%[pixels]]        \n\t"
743
        "cmp r12, #8                    \n\t"
744
        "wldrd wr13, [%[pixels], #8]    \n\t"
745
        "wldrd wr14, [%[pixels], #16]   \n\t"
746
        "add %[pixels], %[pixels], %[line_size] \n\t"
747
        "walignr1 wr6, wr12, wr13       \n\t"
748
        "pld [%[pixels]]                \n\t"
749
        "pld [%[pixels], #32]           \n\t"
750
        "walignr1 wr7, wr13, wr14       \n\t"
751
        "wmoveq wr10, wr13              \n\t"
752
        "wmoveq wr11, wr14              \n\t"
753
        "walignr2ne wr10, wr12, wr13    \n\t"
754
        "walignr2ne wr11, wr13, wr14    \n\t"
755
        "wunpckelub wr4, wr6            \n\t"
756
        "wunpckehub wr5, wr6            \n\t"
757
        "wunpckelub wr6, wr7            \n\t"
758
        "wunpckehub wr7, wr7            \n\t"
759
        "wunpckelub wr8, wr10           \n\t"
760
        "wunpckehub wr9, wr10           \n\t"
761
        "wunpckelub wr10, wr11          \n\t"
762
        "wunpckehub wr11, wr11          \n\t"
763
        "waddhus wr4, wr4, wr8          \n\t"
764
        "waddhus wr5, wr5, wr9          \n\t"
765
        "waddhus wr6, wr6, wr10         \n\t"
766
        "waddhus wr7, wr7, wr11         \n\t"
767
        "waddhus wr8, wr0, wr4          \n\t"
768
        "waddhus wr9, wr1, wr5          \n\t"
769
        "waddhus wr10, wr2, wr6         \n\t"
770
        "waddhus wr11, wr3, wr7         \n\t"
771
        "waddhus wr8, wr8, wr15         \n\t"
772
        "waddhus wr9, wr9, wr15         \n\t"
773
        "waddhus wr10, wr10, wr15       \n\t"
774
        "waddhus wr11, wr11, wr15       \n\t"
775
        "wsrlhg wr8, wr8, wcgr0         \n\t"
776
        "wsrlhg wr9, wr9, wcgr0         \n\t"
777
        "wsrlhg wr10, wr10, wcgr0       \n\t"
778
        "wsrlhg wr11, wr11, wcgr0       \n\t"
779
        "wpackhus wr8, wr8, wr9         \n\t"
780
        "wpackhus wr9, wr10, wr11       \n\t"
781
        "wstrd wr8, [%[block]]          \n\t"
782
        "wstrd wr9, [%[block], #8]      \n\t"
783
        "add %[block], %[block], %[line_size]   \n\t"
784

    
785
        // [wr0 wr1 wr2 wr3] <= *
786
        // [wr4 wr5 wr6 wr7]
787
        "wldrd wr12, [%[pixels]]        \n\t"
788
        "wldrd wr13, [%[pixels], #8]    \n\t"
789
        "wldrd wr14, [%[pixels], #16]   \n\t"
790
        "add %[pixels], %[pixels], %[line_size] \n\t"
791
        "walignr1 wr2, wr12, wr13       \n\t"
792
        "pld [%[pixels]]                \n\t"
793
        "pld [%[pixels], #32]           \n\t"
794
        "walignr1 wr3, wr13, wr14       \n\t"
795
        "wmoveq wr10, wr13              \n\t"
796
        "wmoveq wr11, wr14              \n\t"
797
        "walignr2ne wr10, wr12, wr13    \n\t"
798
        "walignr2ne wr11, wr13, wr14    \n\t"
799
        "wunpckelub wr0, wr2            \n\t"
800
        "wunpckehub wr1, wr2            \n\t"
801
        "wunpckelub wr2, wr3            \n\t"
802
        "wunpckehub wr3, wr3            \n\t"
803
        "wunpckelub wr8, wr10           \n\t"
804
        "wunpckehub wr9, wr10           \n\t"
805
        "wunpckelub wr10, wr11          \n\t"
806
        "wunpckehub wr11, wr11          \n\t"
807
        "waddhus wr0, wr0, wr8          \n\t"
808
        "waddhus wr1, wr1, wr9          \n\t"
809
        "waddhus wr2, wr2, wr10         \n\t"
810
        "waddhus wr3, wr3, wr11         \n\t"
811
        "waddhus wr8, wr0, wr4          \n\t"
812
        "waddhus wr9, wr1, wr5          \n\t"
813
        "waddhus wr10, wr2, wr6         \n\t"
814
        "waddhus wr11, wr3, wr7         \n\t"
815
        "waddhus wr8, wr8, wr15         \n\t"
816
        "waddhus wr9, wr9, wr15         \n\t"
817
        "waddhus wr10, wr10, wr15       \n\t"
818
        "waddhus wr11, wr11, wr15       \n\t"
819
        "wsrlhg wr8, wr8, wcgr0         \n\t"
820
        "wsrlhg wr9, wr9, wcgr0         \n\t"
821
        "wsrlhg wr10, wr10, wcgr0       \n\t"
822
        "wsrlhg wr11, wr11, wcgr0       \n\t"
823
        "wpackhus wr8, wr8, wr9         \n\t"
824
        "wpackhus wr9, wr10, wr11       \n\t"
825
        "wstrd wr8, [%[block]]          \n\t"
826
        "wstrd wr9, [%[block], #8]      \n\t"
827
        "add %[block], %[block], %[line_size]   \n\t"
828

    
829
        "subs %[h], %[h], #2            \n\t"
830
        "bne 1b                         \n\t"
831
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
832
        : [line_size]"r"(line_size)
833
        : "r12", "memory");
834
}
835

    
836
void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
837
{
838
    // [wr0 wr1 wr2 wr3] for previous line
839
    // [wr4 wr5 wr6 wr7] for current line
840
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
841
    __asm__ __volatile__(
842
        "pld [%[block]]                 \n\t"
843
        "pld [%[block], #32]            \n\t"
844
        "pld [%[pixels]]                \n\t"
845
        "mov r12, #2                    \n\t"
846
        "pld [%[pixels], #32]           \n\t"
847
        "tmcr wcgr0, r12                \n\t" /* for shift value */
848
        "and r12, %[pixels], #7         \n\t"
849
        "bic %[pixels], %[pixels], #7   \n\t"
850
        "tmcr wcgr1, r12                \n\t"
851

    
852
        // [wr0 wr1 wr2 wr3] <= *
853
        // [wr4 wr5 wr6 wr7]
854
        "wldrd wr12, [%[pixels]]        \n\t"
855
        "add r12, r12, #1               \n\t"
856
        "wldrd wr13, [%[pixels], #8]    \n\t"
857
        "tmcr wcgr2, r12                \n\t"
858
        "add %[pixels], %[pixels], %[line_size] \n\t"
859
        "cmp r12, #8                    \n\t"
860
        "pld [%[pixels]]                \n\t"
861
        "pld [%[pixels], #32]           \n\t"
862
        "walignr1 wr2, wr12, wr13       \n\t"
863
        "wmoveq wr10, wr13              \n\t"
864
        "walignr2ne wr10, wr12, wr13    \n\t"
865
        "wunpckelub wr0, wr2            \n\t"
866
        "wunpckehub wr1, wr2            \n\t"
867
        "wunpckelub wr8, wr10           \n\t"
868
        "wunpckehub wr9, wr10           \n\t"
869
        "waddhus wr0, wr0, wr8          \n\t"
870
        "waddhus wr1, wr1, wr9          \n\t"
871

    
872
        "1:                             \n\t"
873
        // [wr0 wr1 wr2 wr3]
874
        // [wr4 wr5 wr6 wr7] <= *
875
        "wldrd wr12, [%[pixels]]        \n\t"
876
        "cmp r12, #8                    \n\t"
877
        "wldrd wr13, [%[pixels], #8]    \n\t"
878
        "add %[pixels], %[pixels], %[line_size] \n\t"
879
        "walignr1 wr6, wr12, wr13       \n\t"
880
        "pld [%[pixels]]                \n\t"
881
        "pld [%[pixels], #32]           \n\t"
882
        "wmoveq wr10, wr13              \n\t"
883
        "walignr2ne wr10, wr12, wr13    \n\t"
884
        "wunpckelub wr4, wr6            \n\t"
885
        "wunpckehub wr5, wr6            \n\t"
886
        "wunpckelub wr8, wr10           \n\t"
887
        "wunpckehub wr9, wr10           \n\t"
888
        "waddhus wr4, wr4, wr8          \n\t"
889
        "waddhus wr5, wr5, wr9          \n\t"
890
        "waddhus wr8, wr0, wr4          \n\t"
891
        "waddhus wr9, wr1, wr5          \n\t"
892
        "waddhus wr8, wr8, wr15         \n\t"
893
        "waddhus wr9, wr9, wr15         \n\t"
894
        "wldrd wr12, [%[block]]         \n\t"
895
        "wsrlhg wr8, wr8, wcgr0         \n\t"
896
        "wsrlhg wr9, wr9, wcgr0         \n\t"
897
        "wpackhus wr8, wr8, wr9         \n\t"
898
        WAVG2B" wr8, wr8, wr12          \n\t"
899
        "wstrd wr8, [%[block]]          \n\t"
900
        "add %[block], %[block], %[line_size]   \n\t"
901
        "wldrd wr12, [%[pixels]]        \n\t"
902
        "pld [%[block]]                 \n\t"
903
        "pld [%[block], #32]            \n\t"
904

    
905
        // [wr0 wr1 wr2 wr3] <= *
906
        // [wr4 wr5 wr6 wr7]
907
        "wldrd wr13, [%[pixels], #8]    \n\t"
908
        "add %[pixels], %[pixels], %[line_size] \n\t"
909
        "walignr1 wr2, wr12, wr13       \n\t"
910
        "pld [%[pixels]]                \n\t"
911
        "pld [%[pixels], #32]           \n\t"
912
        "wmoveq wr10, wr13              \n\t"
913
        "walignr2ne wr10, wr12, wr13    \n\t"
914
        "wunpckelub wr0, wr2            \n\t"
915
        "wunpckehub wr1, wr2            \n\t"
916
        "wunpckelub wr8, wr10           \n\t"
917
        "wunpckehub wr9, wr10           \n\t"
918
        "waddhus wr0, wr0, wr8          \n\t"
919
        "waddhus wr1, wr1, wr9          \n\t"
920
        "waddhus wr8, wr0, wr4          \n\t"
921
        "waddhus wr9, wr1, wr5          \n\t"
922
        "waddhus wr8, wr8, wr15         \n\t"
923
        "waddhus wr9, wr9, wr15         \n\t"
924
        "wldrd wr12, [%[block]]         \n\t"
925
        "wsrlhg wr8, wr8, wcgr0         \n\t"
926
        "wsrlhg wr9, wr9, wcgr0         \n\t"
927
        "wpackhus wr8, wr8, wr9         \n\t"
928
        "subs %[h], %[h], #2            \n\t"
929
        WAVG2B" wr8, wr8, wr12          \n\t"
930
        "wstrd wr8, [%[block]]          \n\t"
931
        "add %[block], %[block], %[line_size]   \n\t"
932
        "pld [%[block]]                 \n\t"
933
        "pld [%[block], #32]            \n\t"
934
        "bne 1b                         \n\t"
935
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
936
        : [line_size]"r"(line_size)
937
        : "r12", "memory");
938
}
939

    
940
void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
941
{
942
    // [wr0 wr1 wr2 wr3] for previous line
943
    // [wr4 wr5 wr6 wr7] for current line
944
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
945
    __asm__ __volatile__(
946
        "pld [%[block]]                 \n\t"
947
        "pld [%[block], #32]            \n\t"
948
        "pld [%[pixels]]                \n\t"
949
        "mov r12, #2                    \n\t"
950
        "pld [%[pixels], #32]           \n\t"
951
        "tmcr wcgr0, r12                \n\t" /* for shift value */
952
        /* alignment */
953
        "and r12, %[pixels], #7         \n\t"
954
        "bic %[pixels], %[pixels], #7           \n\t"
955
        "tmcr wcgr1, r12                \n\t"
956
        "add r12, r12, #1               \n\t"
957
        "tmcr wcgr2, r12                \n\t"
958

    
959
        // [wr0 wr1 wr2 wr3] <= *
960
        // [wr4 wr5 wr6 wr7]
961
        "wldrd wr12, [%[pixels]]        \n\t"
962
        "cmp r12, #8                    \n\t"
963
        "wldrd wr13, [%[pixels], #8]    \n\t"
964
        "wldrd wr14, [%[pixels], #16]   \n\t"
965
        "add %[pixels], %[pixels], %[line_size] \n\t"
966
        "pld [%[pixels]]                \n\t"
967
        "walignr1 wr2, wr12, wr13       \n\t"
968
        "pld [%[pixels], #32]           \n\t"
969
        "walignr1 wr3, wr13, wr14       \n\t"
970
        "wmoveq wr10, wr13              \n\t"
971
        "wmoveq wr11, wr14              \n\t"
972
        "walignr2ne wr10, wr12, wr13    \n\t"
973
        "walignr2ne wr11, wr13, wr14    \n\t"
974
        "wunpckelub wr0, wr2            \n\t"
975
        "wunpckehub wr1, wr2            \n\t"
976
        "wunpckelub wr2, wr3            \n\t"
977
        "wunpckehub wr3, wr3            \n\t"
978
        "wunpckelub wr8, wr10           \n\t"
979
        "wunpckehub wr9, wr10           \n\t"
980
        "wunpckelub wr10, wr11          \n\t"
981
        "wunpckehub wr11, wr11          \n\t"
982
        "waddhus wr0, wr0, wr8          \n\t"
983
        "waddhus wr1, wr1, wr9          \n\t"
984
        "waddhus wr2, wr2, wr10         \n\t"
985
        "waddhus wr3, wr3, wr11         \n\t"
986

    
987
        "1:                             \n\t"
988
        // [wr0 wr1 wr2 wr3]
989
        // [wr4 wr5 wr6 wr7] <= *
990
        "wldrd wr12, [%[pixels]]        \n\t"
991
        "cmp r12, #8                    \n\t"
992
        "wldrd wr13, [%[pixels], #8]    \n\t"
993
        "wldrd wr14, [%[pixels], #16]   \n\t"
994
        "add %[pixels], %[pixels], %[line_size] \n\t"
995
        "walignr1 wr6, wr12, wr13       \n\t"
996
        "pld [%[pixels]]                \n\t"
997
        "pld [%[pixels], #32]           \n\t"
998
        "walignr1 wr7, wr13, wr14       \n\t"
999
        "wmoveq wr10, wr13              \n\t"
1000
        "wmoveq wr11, wr14              \n\t"
1001
        "walignr2ne wr10, wr12, wr13    \n\t"
1002
        "walignr2ne wr11, wr13, wr14    \n\t"
1003
        "wunpckelub wr4, wr6            \n\t"
1004
        "wunpckehub wr5, wr6            \n\t"
1005
        "wunpckelub wr6, wr7            \n\t"
1006
        "wunpckehub wr7, wr7            \n\t"
1007
        "wunpckelub wr8, wr10           \n\t"
1008
        "wunpckehub wr9, wr10           \n\t"
1009
        "wunpckelub wr10, wr11          \n\t"
1010
        "wunpckehub wr11, wr11          \n\t"
1011
        "waddhus wr4, wr4, wr8          \n\t"
1012
        "waddhus wr5, wr5, wr9          \n\t"
1013
        "waddhus wr6, wr6, wr10         \n\t"
1014
        "waddhus wr7, wr7, wr11         \n\t"
1015
        "waddhus wr8, wr0, wr4          \n\t"
1016
        "waddhus wr9, wr1, wr5          \n\t"
1017
        "waddhus wr10, wr2, wr6         \n\t"
1018
        "waddhus wr11, wr3, wr7         \n\t"
1019
        "waddhus wr8, wr8, wr15         \n\t"
1020
        "waddhus wr9, wr9, wr15         \n\t"
1021
        "waddhus wr10, wr10, wr15       \n\t"
1022
        "waddhus wr11, wr11, wr15       \n\t"
1023
        "wsrlhg wr8, wr8, wcgr0         \n\t"
1024
        "wsrlhg wr9, wr9, wcgr0         \n\t"
1025
        "wldrd wr12, [%[block]]         \n\t"
1026
        "wldrd wr13, [%[block], #8]     \n\t"
1027
        "wsrlhg wr10, wr10, wcgr0       \n\t"
1028
        "wsrlhg wr11, wr11, wcgr0       \n\t"
1029
        "wpackhus wr8, wr8, wr9         \n\t"
1030
        "wpackhus wr9, wr10, wr11       \n\t"
1031
        WAVG2B" wr8, wr8, wr12          \n\t"
1032
        WAVG2B" wr9, wr9, wr13          \n\t"
1033
        "wstrd wr8, [%[block]]          \n\t"
1034
        "wstrd wr9, [%[block], #8]      \n\t"
1035
        "add %[block], %[block], %[line_size]   \n\t"
1036

    
1037
        // [wr0 wr1 wr2 wr3] <= *
1038
        // [wr4 wr5 wr6 wr7]
1039
        "wldrd wr12, [%[pixels]]        \n\t"
1040
        "pld [%[block]]                 \n\t"
1041
        "wldrd wr13, [%[pixels], #8]    \n\t"
1042
        "pld [%[block], #32]            \n\t"
1043
        "wldrd wr14, [%[pixels], #16]   \n\t"
1044
        "add %[pixels], %[pixels], %[line_size] \n\t"
1045
        "walignr1 wr2, wr12, wr13       \n\t"
1046
        "pld [%[pixels]]                \n\t"
1047
        "pld [%[pixels], #32]           \n\t"
1048
        "walignr1 wr3, wr13, wr14       \n\t"
1049
        "wmoveq wr10, wr13              \n\t"
1050
        "wmoveq wr11, wr14              \n\t"
1051
        "walignr2ne wr10, wr12, wr13    \n\t"
1052
        "walignr2ne wr11, wr13, wr14    \n\t"
1053
        "wunpckelub wr0, wr2            \n\t"
1054
        "wunpckehub wr1, wr2            \n\t"
1055
        "wunpckelub wr2, wr3            \n\t"
1056
        "wunpckehub wr3, wr3            \n\t"
1057
        "wunpckelub wr8, wr10           \n\t"
1058
        "wunpckehub wr9, wr10           \n\t"
1059
        "wunpckelub wr10, wr11          \n\t"
1060
        "wunpckehub wr11, wr11          \n\t"
1061
        "waddhus wr0, wr0, wr8          \n\t"
1062
        "waddhus wr1, wr1, wr9          \n\t"
1063
        "waddhus wr2, wr2, wr10         \n\t"
1064
        "waddhus wr3, wr3, wr11         \n\t"
1065
        "waddhus wr8, wr0, wr4          \n\t"
1066
        "waddhus wr9, wr1, wr5          \n\t"
1067
        "waddhus wr10, wr2, wr6         \n\t"
1068
        "waddhus wr11, wr3, wr7         \n\t"
1069
        "waddhus wr8, wr8, wr15         \n\t"
1070
        "waddhus wr9, wr9, wr15         \n\t"
1071
        "waddhus wr10, wr10, wr15       \n\t"
1072
        "waddhus wr11, wr11, wr15       \n\t"
1073
        "wsrlhg wr8, wr8, wcgr0         \n\t"
1074
        "wsrlhg wr9, wr9, wcgr0         \n\t"
1075
        "wldrd wr12, [%[block]]         \n\t"
1076
        "wldrd wr13, [%[block], #8]     \n\t"
1077
        "wsrlhg wr10, wr10, wcgr0       \n\t"
1078
        "wsrlhg wr11, wr11, wcgr0       \n\t"
1079
        "wpackhus wr8, wr8, wr9         \n\t"
1080
        "wpackhus wr9, wr10, wr11       \n\t"
1081
        WAVG2B" wr8, wr8, wr12          \n\t"
1082
        WAVG2B" wr9, wr9, wr13          \n\t"
1083
        "wstrd wr8, [%[block]]          \n\t"
1084
        "wstrd wr9, [%[block], #8]      \n\t"
1085
        "add %[block], %[block], %[line_size]   \n\t"
1086
        "subs %[h], %[h], #2            \n\t"
1087
        "pld [%[block]]                 \n\t"
1088
        "pld [%[block], #32]            \n\t"
1089
        "bne 1b                         \n\t"
1090
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
1091
        : [line_size]"r"(line_size)
1092
        : "r12", "memory");
1093
}