Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / dsputil_iwmmxt_rnd_template.c @ 2912e87a

History | View | Annotate | Download (47 KB)

/*
 * iWMMXt optimized DSP utils
 * copyright (c) 2004 AGAWA Koji
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22 6ad1fa5a Bernhard Rosenkränzer
void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
23
{
24
    int stride = line_size;
25 be449fca Diego Pettenò
    __asm__ volatile (
26 6ad1fa5a Bernhard Rosenkränzer
        "and r12, %[pixels], #7 \n\t"
27
        "bic %[pixels], %[pixels], #7 \n\t"
28
        "tmcr wcgr1, r12 \n\t"
29
        "add r4, %[pixels], %[line_size] \n\t"
30
        "add r5, %[block], %[line_size] \n\t"
31
        "mov %[line_size], %[line_size], lsl #1 \n\t"
32
        "1: \n\t"
33
        "wldrd wr0, [%[pixels]] \n\t"
34
        "subs %[h], %[h], #2 \n\t"
35
        "wldrd wr1, [%[pixels], #8] \n\t"
36
        "add %[pixels], %[pixels], %[line_size] \n\t"
37
        "wldrd wr3, [r4] \n\t"
38
        "pld [%[pixels]] \n\t"
39
        "pld [%[pixels], #32] \n\t"
40
        "wldrd wr4, [r4, #8] \n\t"
41
        "add r4, r4, %[line_size] \n\t"
42
        "walignr1 wr8, wr0, wr1 \n\t"
43
        "pld [r4] \n\t"
44
        "pld [r4, #32] \n\t"
45
        "walignr1 wr10, wr3, wr4 \n\t"
46
        "wstrd wr8, [%[block]] \n\t"
47
        "add %[block], %[block], %[line_size] \n\t"
48
        "wstrd wr10, [r5] \n\t"
49
        "add r5, r5, %[line_size] \n\t"
50
        "bne 1b \n\t"
51
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
52
        :
53
        : "memory", "r4", "r5", "r12");
54
}
55
56
void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
57
{
58
    int stride = line_size;
59 be449fca Diego Pettenò
    __asm__ volatile (
60 6ad1fa5a Bernhard Rosenkränzer
        "and r12, %[pixels], #7 \n\t"
61
        "bic %[pixels], %[pixels], #7 \n\t"
62
        "tmcr wcgr1, r12 \n\t"
63
        "add r4, %[pixels], %[line_size] \n\t"
64
        "add r5, %[block], %[line_size] \n\t"
65
        "mov %[line_size], %[line_size], lsl #1 \n\t"
66
        "1: \n\t"
67
        "wldrd wr0, [%[pixels]] \n\t"
68
        "subs %[h], %[h], #2 \n\t"
69
        "wldrd wr1, [%[pixels], #8] \n\t"
70
        "add %[pixels], %[pixels], %[line_size] \n\t"
71
        "wldrd wr3, [r4] \n\t"
72
        "pld [%[pixels]] \n\t"
73
        "pld [%[pixels], #32] \n\t"
74
        "wldrd wr4, [r4, #8] \n\t"
75
        "add r4, r4, %[line_size] \n\t"
76
        "walignr1 wr8, wr0, wr1 \n\t"
77
        "wldrd wr0, [%[block]] \n\t"
78
        "wldrd wr2, [r5] \n\t"
79
        "pld [r4] \n\t"
80
        "pld [r4, #32] \n\t"
81
        "walignr1 wr10, wr3, wr4 \n\t"
82
        WAVG2B" wr8, wr8, wr0 \n\t"
83
        WAVG2B" wr10, wr10, wr2 \n\t"
84
        "wstrd wr8, [%[block]] \n\t"
85
        "add %[block], %[block], %[line_size] \n\t"
86
        "wstrd wr10, [r5] \n\t"
87
        "pld [%[block]] \n\t"
88
        "pld [%[block], #32] \n\t"
89
        "add r5, r5, %[line_size] \n\t"
90
        "pld [r5] \n\t"
91
        "pld [r5, #32] \n\t"
92
        "bne 1b \n\t"
93
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
94
        :
95
        : "memory", "r4", "r5", "r12");
96
}
97
98
void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
99
{
100
    int stride = line_size;
101 be449fca Diego Pettenò
    __asm__ volatile (
102 6ad1fa5a Bernhard Rosenkränzer
        "and r12, %[pixels], #7 \n\t"
103
        "bic %[pixels], %[pixels], #7 \n\t"
104
        "tmcr wcgr1, r12 \n\t"
105
        "add r4, %[pixels], %[line_size] \n\t"
106
        "add r5, %[block], %[line_size] \n\t"
107
        "mov %[line_size], %[line_size], lsl #1 \n\t"
108
        "1: \n\t"
109
        "wldrd wr0, [%[pixels]] \n\t"
110
        "wldrd wr1, [%[pixels], #8] \n\t"
111
        "subs %[h], %[h], #2 \n\t"
112
        "wldrd wr2, [%[pixels], #16] \n\t"
113
        "add %[pixels], %[pixels], %[line_size] \n\t"
114
        "wldrd wr3, [r4] \n\t"
115
        "pld [%[pixels]] \n\t"
116
        "pld [%[pixels], #32] \n\t"
117
        "walignr1 wr8, wr0, wr1 \n\t"
118
        "wldrd wr4, [r4, #8] \n\t"
119
        "walignr1 wr9, wr1, wr2 \n\t"
120
        "wldrd wr5, [r4, #16] \n\t"
121
        "add r4, r4, %[line_size] \n\t"
122
        "pld [r4] \n\t"
123
        "pld [r4, #32] \n\t"
124
        "walignr1 wr10, wr3, wr4 \n\t"
125
        "wstrd wr8, [%[block]] \n\t"
126
        "walignr1 wr11, wr4, wr5 \n\t"
127
        "wstrd wr9, [%[block], #8] \n\t"
128
        "add %[block], %[block], %[line_size] \n\t"
129
        "wstrd wr10, [r5] \n\t"
130
        "wstrd wr11, [r5, #8] \n\t"
131
        "add r5, r5, %[line_size] \n\t"
132
        "bne 1b \n\t"
133
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
134
        :
135
        : "memory", "r4", "r5", "r12");
136
}
137
138
void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
139
{
140
    int stride = line_size;
141 be449fca Diego Pettenò
    __asm__ volatile (
142 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
143
        "pld [%[pixels], #32]           \n\t"
144
        "pld [%[block]]                 \n\t"
145
        "pld [%[block], #32]            \n\t"
146
        "and r12, %[pixels], #7         \n\t"
147
        "bic %[pixels], %[pixels], #7   \n\t"
148
        "tmcr wcgr1, r12                \n\t"
149
        "add r4, %[pixels], %[line_size]\n\t"
150
        "add r5, %[block], %[line_size] \n\t"
151
        "mov %[line_size], %[line_size], lsl #1 \n\t"
152
        "1:                             \n\t"
153
        "wldrd wr0, [%[pixels]]         \n\t"
154
        "wldrd wr1, [%[pixels], #8]     \n\t"
155
        "subs %[h], %[h], #2            \n\t"
156
        "wldrd wr2, [%[pixels], #16]    \n\t"
157
        "add %[pixels], %[pixels], %[line_size] \n\t"
158
        "wldrd wr3, [r4]                \n\t"
159
        "pld [%[pixels]]                \n\t"
160
        "pld [%[pixels], #32]           \n\t"
161
        "walignr1 wr8, wr0, wr1         \n\t"
162
        "wldrd wr4, [r4, #8]            \n\t"
163
        "walignr1 wr9, wr1, wr2         \n\t"
164
        "wldrd wr5, [r4, #16]           \n\t"
165
        "add r4, r4, %[line_size]       \n\t"
166
        "wldrd wr0, [%[block]]          \n\t"
167
        "pld [r4]                       \n\t"
168
        "wldrd wr1, [%[block], #8]      \n\t"
169
        "pld [r4, #32]                  \n\t"
170
        "wldrd wr2, [r5]                \n\t"
171
        "walignr1 wr10, wr3, wr4        \n\t"
172
        "wldrd wr3, [r5, #8]            \n\t"
173
        WAVG2B" wr8, wr8, wr0           \n\t"
174
        WAVG2B" wr9, wr9, wr1           \n\t"
175
        WAVG2B" wr10, wr10, wr2         \n\t"
176
        "wstrd wr8, [%[block]]          \n\t"
177
        "walignr1 wr11, wr4, wr5        \n\t"
178
        WAVG2B" wr11, wr11, wr3         \n\t"
179
        "wstrd wr9, [%[block], #8]      \n\t"
180
        "add %[block], %[block], %[line_size] \n\t"
181
        "wstrd wr10, [r5]               \n\t"
182
        "pld [%[block]]                 \n\t"
183
        "pld [%[block], #32]            \n\t"
184
        "wstrd wr11, [r5, #8]           \n\t"
185
        "add r5, r5, %[line_size]       \n\t"
186
        "pld [r5]                       \n\t"
187
        "pld [r5, #32]                  \n\t"
188
        "bne 1b \n\t"
189
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
190
        :
191
        : "memory", "r4", "r5", "r12");
192
}
193
194
void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
195
{
196
    int stride = line_size;
197
    // [wr0 wr1 wr2 wr3] for previous line
198
    // [wr4 wr5 wr6 wr7] for current line
199
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
200 be449fca Diego Pettenò
    __asm__ volatile(
201 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
202
        "pld [%[pixels], #32]           \n\t"
203
        "and r12, %[pixels], #7         \n\t"
204
        "bic %[pixels], %[pixels], #7   \n\t"
205
        "tmcr wcgr1, r12                \n\t"
206
        "add r12, r12, #1               \n\t"
207
        "add r4, %[pixels], %[line_size]\n\t"
208
        "tmcr wcgr2, r12                \n\t"
209
        "add r5, %[block], %[line_size] \n\t"
210
        "mov %[line_size], %[line_size], lsl #1 \n\t"
211
212
        "1:                             \n\t"
213
        "wldrd wr10, [%[pixels]]        \n\t"
214
        "cmp r12, #8                    \n\t"
215
        "wldrd wr11, [%[pixels], #8]    \n\t"
216
        "add %[pixels], %[pixels], %[line_size] \n\t"
217
        "wldrd wr13, [r4]               \n\t"
218
        "pld [%[pixels]]                \n\t"
219
        "wldrd wr14, [r4, #8]           \n\t"
220
        "pld [%[pixels], #32]           \n\t"
221
        "add r4, r4, %[line_size]       \n\t"
222
        "walignr1 wr0, wr10, wr11       \n\t"
223
        "pld [r4]                       \n\t"
224
        "pld [r4, #32]                  \n\t"
225
        "walignr1 wr2, wr13, wr14       \n\t"
226
        "wmoveq wr4, wr11               \n\t"
227
        "wmoveq wr6, wr14               \n\t"
228
        "walignr2ne wr4, wr10, wr11     \n\t"
229
        "walignr2ne wr6, wr13, wr14     \n\t"
230
        WAVG2B" wr0, wr0, wr4           \n\t"
231
        WAVG2B" wr2, wr2, wr6           \n\t"
232
        "wstrd wr0, [%[block]]          \n\t"
233
        "subs %[h], %[h], #2            \n\t"
234
        "wstrd wr2, [r5]                \n\t"
235
        "add %[block], %[block], %[line_size]   \n\t"
236
        "add r5, r5, %[line_size]       \n\t"
237
        "bne 1b                         \n\t"
238
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
239
        :
240
        : "r4", "r5", "r12", "memory");
241
}
242
243
void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
244
{
245
    int stride = line_size;
246
    // [wr0 wr1 wr2 wr3] for previous line
247
    // [wr4 wr5 wr6 wr7] for current line
248
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
249 be449fca Diego Pettenò
    __asm__ volatile(
250 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
251
        "pld [%[pixels], #32]           \n\t"
252
        "and r12, %[pixels], #7         \n\t"
253
        "bic %[pixels], %[pixels], #7   \n\t"
254
        "tmcr wcgr1, r12                \n\t"
255
        "add r12, r12, #1               \n\t"
256
        "add r4, %[pixels], %[line_size]\n\t"
257
        "tmcr wcgr2, r12                \n\t"
258
        "add r5, %[block], %[line_size] \n\t"
259
        "mov %[line_size], %[line_size], lsl #1 \n\t"
260
261
        "1:                             \n\t"
262
        "wldrd wr10, [%[pixels]]        \n\t"
263
        "cmp r12, #8                    \n\t"
264
        "wldrd wr11, [%[pixels], #8]    \n\t"
265
        "wldrd wr12, [%[pixels], #16]   \n\t"
266
        "add %[pixels], %[pixels], %[line_size] \n\t"
267
        "wldrd wr13, [r4]               \n\t"
268
        "pld [%[pixels]]                \n\t"
269
        "wldrd wr14, [r4, #8]           \n\t"
270
        "pld [%[pixels], #32]           \n\t"
271
        "wldrd wr15, [r4, #16]          \n\t"
272
        "add r4, r4, %[line_size]       \n\t"
273
        "walignr1 wr0, wr10, wr11       \n\t"
274
        "pld [r4]                       \n\t"
275
        "pld [r4, #32]                  \n\t"
276
        "walignr1 wr1, wr11, wr12       \n\t"
277
        "walignr1 wr2, wr13, wr14       \n\t"
278
        "walignr1 wr3, wr14, wr15       \n\t"
279
        "wmoveq wr4, wr11               \n\t"
280
        "wmoveq wr5, wr12               \n\t"
281
        "wmoveq wr6, wr14               \n\t"
282
        "wmoveq wr7, wr15               \n\t"
283
        "walignr2ne wr4, wr10, wr11     \n\t"
284
        "walignr2ne wr5, wr11, wr12     \n\t"
285
        "walignr2ne wr6, wr13, wr14     \n\t"
286
        "walignr2ne wr7, wr14, wr15     \n\t"
287
        WAVG2B" wr0, wr0, wr4           \n\t"
288
        WAVG2B" wr1, wr1, wr5           \n\t"
289
        "wstrd wr0, [%[block]]          \n\t"
290
        WAVG2B" wr2, wr2, wr6           \n\t"
291
        "wstrd wr1, [%[block], #8]      \n\t"
292
        WAVG2B" wr3, wr3, wr7           \n\t"
293
        "add %[block], %[block], %[line_size]   \n\t"
294
        "wstrd wr2, [r5]                \n\t"
295
        "subs %[h], %[h], #2            \n\t"
296
        "wstrd wr3, [r5, #8]            \n\t"
297
        "add r5, r5, %[line_size]       \n\t"
298
        "bne 1b                         \n\t"
299
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
300
        :
301
        : "r4", "r5", "r12", "memory");
302
}
303
304
void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
305
{
306
    int stride = line_size;
307
    // [wr0 wr1 wr2 wr3] for previous line
308
    // [wr4 wr5 wr6 wr7] for current line
309
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
310 be449fca Diego Pettenò
    __asm__ volatile(
311 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
312
        "pld [%[pixels], #32]           \n\t"
313
        "pld [%[block]]                 \n\t"
314
        "pld [%[block], #32]            \n\t"
315
        "and r12, %[pixels], #7         \n\t"
316
        "bic %[pixels], %[pixels], #7   \n\t"
317
        "tmcr wcgr1, r12                \n\t"
318
        "add r12, r12, #1               \n\t"
319
        "add r4, %[pixels], %[line_size]\n\t"
320
        "tmcr wcgr2, r12                \n\t"
321
        "add r5, %[block], %[line_size] \n\t"
322
        "mov %[line_size], %[line_size], lsl #1 \n\t"
323
        "pld [r5]                       \n\t"
324
        "pld [r5, #32]                  \n\t"
325
326
        "1:                             \n\t"
327
        "wldrd wr10, [%[pixels]]        \n\t"
328
        "cmp r12, #8                    \n\t"
329
        "wldrd wr11, [%[pixels], #8]    \n\t"
330
        "add %[pixels], %[pixels], %[line_size] \n\t"
331
        "wldrd wr13, [r4]               \n\t"
332
        "pld [%[pixels]]                \n\t"
333
        "wldrd wr14, [r4, #8]           \n\t"
334
        "pld [%[pixels], #32]           \n\t"
335
        "add r4, r4, %[line_size]       \n\t"
336
        "walignr1 wr0, wr10, wr11       \n\t"
337
        "pld [r4]                       \n\t"
338
        "pld [r4, #32]                  \n\t"
339
        "walignr1 wr2, wr13, wr14       \n\t"
340
        "wmoveq wr4, wr11               \n\t"
341
        "wmoveq wr6, wr14               \n\t"
342
        "walignr2ne wr4, wr10, wr11     \n\t"
343
        "wldrd wr10, [%[block]]         \n\t"
344
        "walignr2ne wr6, wr13, wr14     \n\t"
345
        "wldrd wr12, [r5]               \n\t"
346
        WAVG2B" wr0, wr0, wr4           \n\t"
347
        WAVG2B" wr2, wr2, wr6           \n\t"
348
        WAVG2B" wr0, wr0, wr10          \n\t"
349
        WAVG2B" wr2, wr2, wr12          \n\t"
350
        "wstrd wr0, [%[block]]          \n\t"
351
        "subs %[h], %[h], #2            \n\t"
352
        "wstrd wr2, [r5]                \n\t"
353
        "add %[block], %[block], %[line_size]   \n\t"
354
        "add r5, r5, %[line_size]       \n\t"
355
        "pld [%[block]]                 \n\t"
356
        "pld [%[block], #32]            \n\t"
357
        "pld [r5]                       \n\t"
358
        "pld [r5, #32]                  \n\t"
359
        "bne 1b                         \n\t"
360
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
361
        :
362
        : "r4", "r5", "r12", "memory");
363
}
364
365
void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
366
{
367
    int stride = line_size;
368
    // [wr0 wr1 wr2 wr3] for previous line
369
    // [wr4 wr5 wr6 wr7] for current line
370
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
371 be449fca Diego Pettenò
    __asm__ volatile(
372 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
373
        "pld [%[pixels], #32]           \n\t"
374
        "pld [%[block]]                 \n\t"
375
        "pld [%[block], #32]            \n\t"
376
        "and r12, %[pixels], #7         \n\t"
377
        "bic %[pixels], %[pixels], #7   \n\t"
378
        "tmcr wcgr1, r12                \n\t"
379
        "add r12, r12, #1               \n\t"
380
        "add r4, %[pixels], %[line_size]\n\t"
381
        "tmcr wcgr2, r12                \n\t"
382
        "add r5, %[block], %[line_size] \n\t"
383
        "mov %[line_size], %[line_size], lsl #1 \n\t"
384
        "pld [r5]                       \n\t"
385
        "pld [r5, #32]                  \n\t"
386
387
        "1:                             \n\t"
388
        "wldrd wr10, [%[pixels]]        \n\t"
389
        "cmp r12, #8                    \n\t"
390
        "wldrd wr11, [%[pixels], #8]    \n\t"
391
        "wldrd wr12, [%[pixels], #16]   \n\t"
392
        "add %[pixels], %[pixels], %[line_size] \n\t"
393
        "wldrd wr13, [r4]               \n\t"
394
        "pld [%[pixels]]                \n\t"
395
        "wldrd wr14, [r4, #8]           \n\t"
396
        "pld [%[pixels], #32]           \n\t"
397
        "wldrd wr15, [r4, #16]          \n\t"
398
        "add r4, r4, %[line_size]       \n\t"
399
        "walignr1 wr0, wr10, wr11       \n\t"
400
        "pld [r4]                       \n\t"
401
        "pld [r4, #32]                  \n\t"
402
        "walignr1 wr1, wr11, wr12       \n\t"
403
        "walignr1 wr2, wr13, wr14       \n\t"
404
        "walignr1 wr3, wr14, wr15       \n\t"
405
        "wmoveq wr4, wr11               \n\t"
406
        "wmoveq wr5, wr12               \n\t"
407
        "wmoveq wr6, wr14               \n\t"
408
        "wmoveq wr7, wr15               \n\t"
409
        "walignr2ne wr4, wr10, wr11     \n\t"
410
        "walignr2ne wr5, wr11, wr12     \n\t"
411
        "walignr2ne wr6, wr13, wr14     \n\t"
412
        "walignr2ne wr7, wr14, wr15     \n\t"
413
        "wldrd wr10, [%[block]]         \n\t"
414
        WAVG2B" wr0, wr0, wr4           \n\t"
415
        "wldrd wr11, [%[block], #8]     \n\t"
416
        WAVG2B" wr1, wr1, wr5           \n\t"
417
        "wldrd wr12, [r5]               \n\t"
418
        WAVG2B" wr2, wr2, wr6           \n\t"
419
        "wldrd wr13, [r5, #8]           \n\t"
420
        WAVG2B" wr3, wr3, wr7           \n\t"
421
        WAVG2B" wr0, wr0, wr10          \n\t"
422
        WAVG2B" wr1, wr1, wr11          \n\t"
423
        WAVG2B" wr2, wr2, wr12          \n\t"
424
        WAVG2B" wr3, wr3, wr13          \n\t"
425
        "wstrd wr0, [%[block]]          \n\t"
426
        "subs %[h], %[h], #2            \n\t"
427
        "wstrd wr1, [%[block], #8]      \n\t"
428
        "add %[block], %[block], %[line_size]   \n\t"
429
        "wstrd wr2, [r5]                \n\t"
430
        "pld [%[block]]                 \n\t"
431
        "wstrd wr3, [r5, #8]            \n\t"
432
        "add r5, r5, %[line_size]       \n\t"
433
        "pld [%[block], #32]            \n\t"
434
        "pld [r5]                       \n\t"
435
        "pld [r5, #32]                  \n\t"
436
        "bne 1b                         \n\t"
437
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
438
        :
439
        :"r4", "r5", "r12", "memory");
440
}
441
442
void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
443
{
444
    int stride = line_size;
445
    // [wr0 wr1 wr2 wr3] for previous line
446
    // [wr4 wr5 wr6 wr7] for current line
447 be449fca Diego Pettenò
    __asm__ volatile(
448 6ad1fa5a Bernhard Rosenkränzer
        "pld            [%[pixels]]                             \n\t"
449
        "pld            [%[pixels], #32]                        \n\t"
450
        "and            r12, %[pixels], #7                      \n\t"
451
        "tmcr           wcgr1, r12                              \n\t"
452
        "bic            %[pixels], %[pixels], #7                \n\t"
453
454
        "wldrd          wr10, [%[pixels]]                       \n\t"
455
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
456
        "pld            [%[block]]                              \n\t"
457
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
458
        "walignr1       wr0, wr10, wr11                         \n\t"
459
        "pld            [%[pixels]]                             \n\t"
460
        "pld            [%[pixels], #32]                        \n\t"
461
462
      "1:                                                       \n\t"
463
        "wldrd          wr10, [%[pixels]]                       \n\t"
464
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
465
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
466
        "pld            [%[pixels]]                             \n\t"
467
        "pld            [%[pixels], #32]                        \n\t"
468
        "walignr1       wr4, wr10, wr11                         \n\t"
469
        "wldrd          wr10, [%[block]]                        \n\t"
470
         WAVG2B"        wr8, wr0, wr4                           \n\t"
471
         WAVG2B"        wr8, wr8, wr10                          \n\t"
472
        "wstrd          wr8, [%[block]]                         \n\t"
473
        "add            %[block], %[block], %[line_size]        \n\t"
474
475
        "wldrd          wr10, [%[pixels]]                       \n\t"
476
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
477
        "pld            [%[block]]                              \n\t"
478
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
479
        "pld            [%[pixels]]                             \n\t"
480
        "pld            [%[pixels], #32]                        \n\t"
481
        "walignr1       wr0, wr10, wr11                         \n\t"
482
        "wldrd          wr10, [%[block]]                        \n\t"
483
         WAVG2B"        wr8, wr0, wr4                           \n\t"
484
         WAVG2B"        wr8, wr8, wr10                          \n\t"
485
        "wstrd          wr8, [%[block]]                         \n\t"
486
        "add            %[block], %[block], %[line_size]        \n\t"
487
488
        "subs           %[h], %[h], #2                          \n\t"
489
        "pld            [%[block]]                              \n\t"
490
        "bne            1b                                      \n\t"
491
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
492
        :
493
        : "cc", "memory", "r12");
494
}
495
496
void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
497
{
498
    int stride = line_size;
499
    // [wr0 wr1 wr2 wr3] for previous line
500
    // [wr4 wr5 wr6 wr7] for current line
501 be449fca Diego Pettenò
    __asm__ volatile(
502 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
503
        "pld [%[pixels], #32]           \n\t"
504
        "and r12, %[pixels], #7         \n\t"
505
        "tmcr wcgr1, r12                \n\t"
506
        "bic %[pixels], %[pixels], #7   \n\t"
507
508
        "wldrd wr10, [%[pixels]]        \n\t"
509
        "wldrd wr11, [%[pixels], #8]    \n\t"
510
        "wldrd wr12, [%[pixels], #16]   \n\t"
511
        "add %[pixels], %[pixels], %[line_size] \n\t"
512
        "pld [%[pixels]]                \n\t"
513
        "pld [%[pixels], #32]           \n\t"
514
        "walignr1 wr0, wr10, wr11       \n\t"
515
        "walignr1 wr1, wr11, wr12       \n\t"
516
517
        "1:                             \n\t"
518
        "wldrd wr10, [%[pixels]]        \n\t"
519
        "wldrd wr11, [%[pixels], #8]    \n\t"
520
        "wldrd wr12, [%[pixels], #16]   \n\t"
521
        "add %[pixels], %[pixels], %[line_size] \n\t"
522
        "pld [%[pixels]]                \n\t"
523
        "pld [%[pixels], #32]           \n\t"
524
        "walignr1 wr4, wr10, wr11       \n\t"
525
        "walignr1 wr5, wr11, wr12       \n\t"
526
        WAVG2B" wr8, wr0, wr4           \n\t"
527
        WAVG2B" wr9, wr1, wr5           \n\t"
528
        "wstrd wr8, [%[block]]          \n\t"
529
        "wstrd wr9, [%[block], #8]      \n\t"
530
        "add %[block], %[block], %[line_size]   \n\t"
531
532
        "wldrd wr10, [%[pixels]]        \n\t"
533
        "wldrd wr11, [%[pixels], #8]    \n\t"
534
        "wldrd wr12, [%[pixels], #16]   \n\t"
535
        "add %[pixels], %[pixels], %[line_size] \n\t"
536
        "pld [%[pixels]]                \n\t"
537
        "pld [%[pixels], #32]           \n\t"
538
        "walignr1 wr0, wr10, wr11       \n\t"
539
        "walignr1 wr1, wr11, wr12       \n\t"
540
        WAVG2B" wr8, wr0, wr4           \n\t"
541
        WAVG2B" wr9, wr1, wr5           \n\t"
542
        "wstrd wr8, [%[block]]          \n\t"
543
        "wstrd wr9, [%[block], #8]      \n\t"
544
        "add %[block], %[block], %[line_size]   \n\t"
545
546
        "subs %[h], %[h], #2            \n\t"
547
        "bne 1b                         \n\t"
548
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
549
        :
550
        : "r4", "r5", "r12", "memory");
551
}
552
553
void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
554
{
555
    int stride = line_size;
556
    // [wr0 wr1 wr2 wr3] for previous line
557
    // [wr4 wr5 wr6 wr7] for current line
558 be449fca Diego Pettenò
    __asm__ volatile(
559 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
560
        "pld [%[pixels], #32]           \n\t"
561
        "and r12, %[pixels], #7         \n\t"
562
        "tmcr wcgr1, r12                \n\t"
563
        "bic %[pixels], %[pixels], #7   \n\t"
564
565
        "wldrd wr10, [%[pixels]]        \n\t"
566
        "wldrd wr11, [%[pixels], #8]    \n\t"
567
        "pld [%[block]]                 \n\t"
568
        "wldrd wr12, [%[pixels], #16]   \n\t"
569
        "add %[pixels], %[pixels], %[line_size] \n\t"
570
        "pld [%[pixels]]                \n\t"
571
        "pld [%[pixels], #32]           \n\t"
572
        "walignr1 wr0, wr10, wr11       \n\t"
573
        "walignr1 wr1, wr11, wr12       \n\t"
574
575
        "1:                             \n\t"
576
        "wldrd wr10, [%[pixels]]        \n\t"
577
        "wldrd wr11, [%[pixels], #8]    \n\t"
578
        "wldrd wr12, [%[pixels], #16]   \n\t"
579
        "add %[pixels], %[pixels], %[line_size] \n\t"
580
        "pld [%[pixels]]                \n\t"
581
        "pld [%[pixels], #32]           \n\t"
582
        "walignr1 wr4, wr10, wr11       \n\t"
583
        "walignr1 wr5, wr11, wr12       \n\t"
584
        "wldrd wr10, [%[block]]         \n\t"
585
        "wldrd wr11, [%[block], #8]     \n\t"
586
        WAVG2B" wr8, wr0, wr4           \n\t"
587
        WAVG2B" wr9, wr1, wr5           \n\t"
588
        WAVG2B" wr8, wr8, wr10          \n\t"
589
        WAVG2B" wr9, wr9, wr11          \n\t"
590
        "wstrd wr8, [%[block]]          \n\t"
591
        "wstrd wr9, [%[block], #8]      \n\t"
592
        "add %[block], %[block], %[line_size]   \n\t"
593
594
        "wldrd wr10, [%[pixels]]        \n\t"
595
        "wldrd wr11, [%[pixels], #8]    \n\t"
596
        "pld [%[block]]                 \n\t"
597
        "wldrd wr12, [%[pixels], #16]   \n\t"
598
        "add %[pixels], %[pixels], %[line_size] \n\t"
599
        "pld [%[pixels]]                \n\t"
600
        "pld [%[pixels], #32]           \n\t"
601
        "walignr1 wr0, wr10, wr11       \n\t"
602
        "walignr1 wr1, wr11, wr12       \n\t"
603
        "wldrd wr10, [%[block]]         \n\t"
604
        "wldrd wr11, [%[block], #8]     \n\t"
605
        WAVG2B" wr8, wr0, wr4           \n\t"
606
        WAVG2B" wr9, wr1, wr5           \n\t"
607
        WAVG2B" wr8, wr8, wr10          \n\t"
608
        WAVG2B" wr9, wr9, wr11          \n\t"
609
        "wstrd wr8, [%[block]]          \n\t"
610
        "wstrd wr9, [%[block], #8]      \n\t"
611
        "add %[block], %[block], %[line_size]   \n\t"
612
613
        "subs %[h], %[h], #2            \n\t"
614
        "pld [%[block]]                 \n\t"
615
        "bne 1b                         \n\t"
616
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
617
        :
618
        : "r4", "r5", "r12", "memory");
619
}
620
621
void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
622
{
623
    // [wr0 wr1 wr2 wr3] for previous line
624
    // [wr4 wr5 wr6 wr7] for current line
625
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
626 be449fca Diego Pettenò
    __asm__ volatile(
627 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
628
        "mov r12, #2                    \n\t"
629
        "pld [%[pixels], #32]           \n\t"
630
        "tmcr wcgr0, r12                \n\t" /* for shift value */
631
        "and r12, %[pixels], #7         \n\t"
632
        "bic %[pixels], %[pixels], #7   \n\t"
633
        "tmcr wcgr1, r12                \n\t"
634
635
        // [wr0 wr1 wr2 wr3] <= *
636
        // [wr4 wr5 wr6 wr7]
637
        "wldrd wr12, [%[pixels]]        \n\t"
638
        "add r12, r12, #1               \n\t"
639
        "wldrd wr13, [%[pixels], #8]    \n\t"
640
        "tmcr wcgr2, r12                \n\t"
641
        "add %[pixels], %[pixels], %[line_size] \n\t"
642
        "cmp r12, #8                    \n\t"
643
        "pld [%[pixels]]                \n\t"
644
        "pld [%[pixels], #32]           \n\t"
645
        "walignr1 wr2, wr12, wr13       \n\t"
646
        "wmoveq wr10, wr13              \n\t"
647
        "walignr2ne wr10, wr12, wr13    \n\t"
648
        "wunpckelub wr0, wr2            \n\t"
649
        "wunpckehub wr1, wr2            \n\t"
650
        "wunpckelub wr8, wr10           \n\t"
651
        "wunpckehub wr9, wr10           \n\t"
652
        "waddhus wr0, wr0, wr8          \n\t"
653
        "waddhus wr1, wr1, wr9          \n\t"
654
655
        "1:                             \n\t"
656
        // [wr0 wr1 wr2 wr3]
657
        // [wr4 wr5 wr6 wr7] <= *
658
        "wldrd wr12, [%[pixels]]        \n\t"
659
        "cmp r12, #8                    \n\t"
660
        "wldrd wr13, [%[pixels], #8]    \n\t"
661
        "add %[pixels], %[pixels], %[line_size] \n\t"
662
        "walignr1 wr6, wr12, wr13       \n\t"
663
        "pld [%[pixels]]                \n\t"
664
        "pld [%[pixels], #32]           \n\t"
665
        "wmoveq wr10, wr13              \n\t"
666
        "walignr2ne wr10, wr12, wr13    \n\t"
667
        "wunpckelub wr4, wr6            \n\t"
668
        "wunpckehub wr5, wr6            \n\t"
669
        "wunpckelub wr8, wr10           \n\t"
670
        "wunpckehub wr9, wr10           \n\t"
671
        "waddhus wr4, wr4, wr8          \n\t"
672
        "waddhus wr5, wr5, wr9          \n\t"
673
        "waddhus wr8, wr0, wr4          \n\t"
674
        "waddhus wr9, wr1, wr5          \n\t"
675
        "waddhus wr8, wr8, wr15         \n\t"
676
        "waddhus wr9, wr9, wr15         \n\t"
677
        "wsrlhg wr8, wr8, wcgr0         \n\t"
678
        "wsrlhg wr9, wr9, wcgr0         \n\t"
679
        "wpackhus wr8, wr8, wr9         \n\t"
680
        "wstrd wr8, [%[block]]          \n\t"
681
        "add %[block], %[block], %[line_size]   \n\t"
682
683
        // [wr0 wr1 wr2 wr3] <= *
684
        // [wr4 wr5 wr6 wr7]
685
        "wldrd wr12, [%[pixels]]        \n\t"
686
        "wldrd wr13, [%[pixels], #8]    \n\t"
687
        "add %[pixels], %[pixels], %[line_size] \n\t"
688
        "walignr1 wr2, wr12, wr13       \n\t"
689
        "pld [%[pixels]]                \n\t"
690
        "pld [%[pixels], #32]           \n\t"
691
        "wmoveq wr10, wr13              \n\t"
692
        "walignr2ne wr10, wr12, wr13    \n\t"
693
        "wunpckelub wr0, wr2            \n\t"
694
        "wunpckehub wr1, wr2            \n\t"
695
        "wunpckelub wr8, wr10           \n\t"
696
        "wunpckehub wr9, wr10           \n\t"
697
        "waddhus wr0, wr0, wr8          \n\t"
698
        "waddhus wr1, wr1, wr9          \n\t"
699
        "waddhus wr8, wr0, wr4          \n\t"
700
        "waddhus wr9, wr1, wr5          \n\t"
701
        "waddhus wr8, wr8, wr15         \n\t"
702
        "waddhus wr9, wr9, wr15         \n\t"
703
        "wsrlhg wr8, wr8, wcgr0         \n\t"
704
        "wsrlhg wr9, wr9, wcgr0         \n\t"
705
        "wpackhus wr8, wr8, wr9         \n\t"
706
        "subs %[h], %[h], #2            \n\t"
707
        "wstrd wr8, [%[block]]          \n\t"
708
        "add %[block], %[block], %[line_size]   \n\t"
709
        "bne 1b                         \n\t"
710
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
711
        : [line_size]"r"(line_size)
712
        : "r12", "memory");
713
}
714
715
void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
716
{
717
    // [wr0 wr1 wr2 wr3] for previous line
718
    // [wr4 wr5 wr6 wr7] for current line
719
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
720 be449fca Diego Pettenò
    __asm__ volatile(
721 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
722
        "mov r12, #2                    \n\t"
723
        "pld [%[pixels], #32]           \n\t"
724
        "tmcr wcgr0, r12                \n\t" /* for shift value */
725
        /* alignment */
726
        "and r12, %[pixels], #7         \n\t"
727
        "bic %[pixels], %[pixels], #7   \n\t"
728
        "tmcr wcgr1, r12                \n\t"
729
        "add r12, r12, #1               \n\t"
730
        "tmcr wcgr2, r12                \n\t"
731
732
        // [wr0 wr1 wr2 wr3] <= *
733
        // [wr4 wr5 wr6 wr7]
734
        "wldrd wr12, [%[pixels]]        \n\t"
735
        "cmp r12, #8                    \n\t"
736
        "wldrd wr13, [%[pixels], #8]    \n\t"
737
        "wldrd wr14, [%[pixels], #16]   \n\t"
738
        "add %[pixels], %[pixels], %[line_size] \n\t"
739
        "pld [%[pixels]]                \n\t"
740
        "walignr1 wr2, wr12, wr13       \n\t"
741
        "pld [%[pixels], #32]           \n\t"
742
        "walignr1 wr3, wr13, wr14       \n\t"
743
        "wmoveq wr10, wr13              \n\t"
744
        "wmoveq wr11, wr14              \n\t"
745
        "walignr2ne wr10, wr12, wr13    \n\t"
746
        "walignr2ne wr11, wr13, wr14    \n\t"
747
        "wunpckelub wr0, wr2            \n\t"
748
        "wunpckehub wr1, wr2            \n\t"
749
        "wunpckelub wr2, wr3            \n\t"
750
        "wunpckehub wr3, wr3            \n\t"
751
        "wunpckelub wr8, wr10           \n\t"
752
        "wunpckehub wr9, wr10           \n\t"
753
        "wunpckelub wr10, wr11          \n\t"
754
        "wunpckehub wr11, wr11          \n\t"
755
        "waddhus wr0, wr0, wr8          \n\t"
756
        "waddhus wr1, wr1, wr9          \n\t"
757
        "waddhus wr2, wr2, wr10         \n\t"
758
        "waddhus wr3, wr3, wr11         \n\t"
759
760
        "1:                             \n\t"
761
        // [wr0 wr1 wr2 wr3]
762
        // [wr4 wr5 wr6 wr7] <= *
763
        "wldrd wr12, [%[pixels]]        \n\t"
764
        "cmp r12, #8                    \n\t"
765
        "wldrd wr13, [%[pixels], #8]    \n\t"
766
        "wldrd wr14, [%[pixels], #16]   \n\t"
767
        "add %[pixels], %[pixels], %[line_size] \n\t"
768
        "walignr1 wr6, wr12, wr13       \n\t"
769
        "pld [%[pixels]]                \n\t"
770
        "pld [%[pixels], #32]           \n\t"
771
        "walignr1 wr7, wr13, wr14       \n\t"
772
        "wmoveq wr10, wr13              \n\t"
773
        "wmoveq wr11, wr14              \n\t"
774
        "walignr2ne wr10, wr12, wr13    \n\t"
775
        "walignr2ne wr11, wr13, wr14    \n\t"
776
        "wunpckelub wr4, wr6            \n\t"
777
        "wunpckehub wr5, wr6            \n\t"
778
        "wunpckelub wr6, wr7            \n\t"
779
        "wunpckehub wr7, wr7            \n\t"
780
        "wunpckelub wr8, wr10           \n\t"
781
        "wunpckehub wr9, wr10           \n\t"
782
        "wunpckelub wr10, wr11          \n\t"
783
        "wunpckehub wr11, wr11          \n\t"
784
        "waddhus wr4, wr4, wr8          \n\t"
785
        "waddhus wr5, wr5, wr9          \n\t"
786
        "waddhus wr6, wr6, wr10         \n\t"
787
        "waddhus wr7, wr7, wr11         \n\t"
788
        "waddhus wr8, wr0, wr4          \n\t"
789
        "waddhus wr9, wr1, wr5          \n\t"
790
        "waddhus wr10, wr2, wr6         \n\t"
791
        "waddhus wr11, wr3, wr7         \n\t"
792
        "waddhus wr8, wr8, wr15         \n\t"
793
        "waddhus wr9, wr9, wr15         \n\t"
794
        "waddhus wr10, wr10, wr15       \n\t"
795
        "waddhus wr11, wr11, wr15       \n\t"
796
        "wsrlhg wr8, wr8, wcgr0         \n\t"
797
        "wsrlhg wr9, wr9, wcgr0         \n\t"
798
        "wsrlhg wr10, wr10, wcgr0       \n\t"
799
        "wsrlhg wr11, wr11, wcgr0       \n\t"
800
        "wpackhus wr8, wr8, wr9         \n\t"
801
        "wpackhus wr9, wr10, wr11       \n\t"
802
        "wstrd wr8, [%[block]]          \n\t"
803
        "wstrd wr9, [%[block], #8]      \n\t"
804
        "add %[block], %[block], %[line_size]   \n\t"
805
806
        // [wr0 wr1 wr2 wr3] <= *
807
        // [wr4 wr5 wr6 wr7]
808
        "wldrd wr12, [%[pixels]]        \n\t"
809
        "wldrd wr13, [%[pixels], #8]    \n\t"
810
        "wldrd wr14, [%[pixels], #16]   \n\t"
811
        "add %[pixels], %[pixels], %[line_size] \n\t"
812
        "walignr1 wr2, wr12, wr13       \n\t"
813
        "pld [%[pixels]]                \n\t"
814
        "pld [%[pixels], #32]           \n\t"
815
        "walignr1 wr3, wr13, wr14       \n\t"
816
        "wmoveq wr10, wr13              \n\t"
817
        "wmoveq wr11, wr14              \n\t"
818
        "walignr2ne wr10, wr12, wr13    \n\t"
819
        "walignr2ne wr11, wr13, wr14    \n\t"
820
        "wunpckelub wr0, wr2            \n\t"
821
        "wunpckehub wr1, wr2            \n\t"
822
        "wunpckelub wr2, wr3            \n\t"
823
        "wunpckehub wr3, wr3            \n\t"
824
        "wunpckelub wr8, wr10           \n\t"
825
        "wunpckehub wr9, wr10           \n\t"
826
        "wunpckelub wr10, wr11          \n\t"
827
        "wunpckehub wr11, wr11          \n\t"
828
        "waddhus wr0, wr0, wr8          \n\t"
829
        "waddhus wr1, wr1, wr9          \n\t"
830
        "waddhus wr2, wr2, wr10         \n\t"
831
        "waddhus wr3, wr3, wr11         \n\t"
832
        "waddhus wr8, wr0, wr4          \n\t"
833
        "waddhus wr9, wr1, wr5          \n\t"
834
        "waddhus wr10, wr2, wr6         \n\t"
835
        "waddhus wr11, wr3, wr7         \n\t"
836
        "waddhus wr8, wr8, wr15         \n\t"
837
        "waddhus wr9, wr9, wr15         \n\t"
838
        "waddhus wr10, wr10, wr15       \n\t"
839
        "waddhus wr11, wr11, wr15       \n\t"
840
        "wsrlhg wr8, wr8, wcgr0         \n\t"
841
        "wsrlhg wr9, wr9, wcgr0         \n\t"
842
        "wsrlhg wr10, wr10, wcgr0       \n\t"
843
        "wsrlhg wr11, wr11, wcgr0       \n\t"
844
        "wpackhus wr8, wr8, wr9         \n\t"
845
        "wpackhus wr9, wr10, wr11       \n\t"
846
        "wstrd wr8, [%[block]]          \n\t"
847
        "wstrd wr9, [%[block], #8]      \n\t"
848
        "add %[block], %[block], %[line_size]   \n\t"
849
850
        "subs %[h], %[h], #2            \n\t"
851
        "bne 1b                         \n\t"
852
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
853
        : [line_size]"r"(line_size)
854
        : "r12", "memory");
855
}
856
857
void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
858
{
859
    // [wr0 wr1 wr2 wr3] for previous line
860
    // [wr4 wr5 wr6 wr7] for current line
861
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
862 be449fca Diego Pettenò
    __asm__ volatile(
863 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[block]]                 \n\t"
864
        "pld [%[block], #32]            \n\t"
865
        "pld [%[pixels]]                \n\t"
866
        "mov r12, #2                    \n\t"
867
        "pld [%[pixels], #32]           \n\t"
868
        "tmcr wcgr0, r12                \n\t" /* for shift value */
869
        "and r12, %[pixels], #7         \n\t"
870
        "bic %[pixels], %[pixels], #7   \n\t"
871
        "tmcr wcgr1, r12                \n\t"
872
873
        // [wr0 wr1 wr2 wr3] <= *
874
        // [wr4 wr5 wr6 wr7]
875
        "wldrd wr12, [%[pixels]]        \n\t"
876
        "add r12, r12, #1               \n\t"
877
        "wldrd wr13, [%[pixels], #8]    \n\t"
878
        "tmcr wcgr2, r12                \n\t"
879
        "add %[pixels], %[pixels], %[line_size] \n\t"
880
        "cmp r12, #8                    \n\t"
881
        "pld [%[pixels]]                \n\t"
882
        "pld [%[pixels], #32]           \n\t"
883
        "walignr1 wr2, wr12, wr13       \n\t"
884
        "wmoveq wr10, wr13              \n\t"
885
        "walignr2ne wr10, wr12, wr13    \n\t"
886
        "wunpckelub wr0, wr2            \n\t"
887
        "wunpckehub wr1, wr2            \n\t"
888
        "wunpckelub wr8, wr10           \n\t"
889
        "wunpckehub wr9, wr10           \n\t"
890
        "waddhus wr0, wr0, wr8          \n\t"
891
        "waddhus wr1, wr1, wr9          \n\t"
892
893
        "1:                             \n\t"
894
        // [wr0 wr1 wr2 wr3]
895
        // [wr4 wr5 wr6 wr7] <= *
896
        "wldrd wr12, [%[pixels]]        \n\t"
897
        "cmp r12, #8                    \n\t"
898
        "wldrd wr13, [%[pixels], #8]    \n\t"
899
        "add %[pixels], %[pixels], %[line_size] \n\t"
900
        "walignr1 wr6, wr12, wr13       \n\t"
901
        "pld [%[pixels]]                \n\t"
902
        "pld [%[pixels], #32]           \n\t"
903
        "wmoveq wr10, wr13              \n\t"
904
        "walignr2ne wr10, wr12, wr13    \n\t"
905
        "wunpckelub wr4, wr6            \n\t"
906
        "wunpckehub wr5, wr6            \n\t"
907
        "wunpckelub wr8, wr10           \n\t"
908
        "wunpckehub wr9, wr10           \n\t"
909
        "waddhus wr4, wr4, wr8          \n\t"
910
        "waddhus wr5, wr5, wr9          \n\t"
911
        "waddhus wr8, wr0, wr4          \n\t"
912
        "waddhus wr9, wr1, wr5          \n\t"
913
        "waddhus wr8, wr8, wr15         \n\t"
914
        "waddhus wr9, wr9, wr15         \n\t"
915
        "wldrd wr12, [%[block]]         \n\t"
916
        "wsrlhg wr8, wr8, wcgr0         \n\t"
917
        "wsrlhg wr9, wr9, wcgr0         \n\t"
918
        "wpackhus wr8, wr8, wr9         \n\t"
919
        WAVG2B" wr8, wr8, wr12          \n\t"
920
        "wstrd wr8, [%[block]]          \n\t"
921
        "add %[block], %[block], %[line_size]   \n\t"
922
        "wldrd wr12, [%[pixels]]        \n\t"
923
        "pld [%[block]]                 \n\t"
924
        "pld [%[block], #32]            \n\t"
925
926
        // [wr0 wr1 wr2 wr3] <= *
927
        // [wr4 wr5 wr6 wr7]
928
        "wldrd wr13, [%[pixels], #8]    \n\t"
929
        "add %[pixels], %[pixels], %[line_size] \n\t"
930
        "walignr1 wr2, wr12, wr13       \n\t"
931
        "pld [%[pixels]]                \n\t"
932
        "pld [%[pixels], #32]           \n\t"
933
        "wmoveq wr10, wr13              \n\t"
934
        "walignr2ne wr10, wr12, wr13    \n\t"
935
        "wunpckelub wr0, wr2            \n\t"
936
        "wunpckehub wr1, wr2            \n\t"
937
        "wunpckelub wr8, wr10           \n\t"
938
        "wunpckehub wr9, wr10           \n\t"
939
        "waddhus wr0, wr0, wr8          \n\t"
940
        "waddhus wr1, wr1, wr9          \n\t"
941
        "waddhus wr8, wr0, wr4          \n\t"
942
        "waddhus wr9, wr1, wr5          \n\t"
943
        "waddhus wr8, wr8, wr15         \n\t"
944
        "waddhus wr9, wr9, wr15         \n\t"
945
        "wldrd wr12, [%[block]]         \n\t"
946
        "wsrlhg wr8, wr8, wcgr0         \n\t"
947
        "wsrlhg wr9, wr9, wcgr0         \n\t"
948
        "wpackhus wr8, wr8, wr9         \n\t"
949
        "subs %[h], %[h], #2            \n\t"
950
        WAVG2B" wr8, wr8, wr12          \n\t"
951
        "wstrd wr8, [%[block]]          \n\t"
952
        "add %[block], %[block], %[line_size]   \n\t"
953
        "pld [%[block]]                 \n\t"
954
        "pld [%[block], #32]            \n\t"
955
        "bne 1b                         \n\t"
956
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
957
        : [line_size]"r"(line_size)
958
        : "r12", "memory");
959
}
960
961
void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
962
{
963
    // [wr0 wr1 wr2 wr3] for previous line
964
    // [wr4 wr5 wr6 wr7] for current line
965
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
966 be449fca Diego Pettenò
    __asm__ volatile(
967 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[block]]                 \n\t"
968
        "pld [%[block], #32]            \n\t"
969
        "pld [%[pixels]]                \n\t"
970
        "mov r12, #2                    \n\t"
971
        "pld [%[pixels], #32]           \n\t"
972
        "tmcr wcgr0, r12                \n\t" /* for shift value */
973
        /* alignment */
974
        "and r12, %[pixels], #7         \n\t"
975
        "bic %[pixels], %[pixels], #7           \n\t"
976
        "tmcr wcgr1, r12                \n\t"
977
        "add r12, r12, #1               \n\t"
978
        "tmcr wcgr2, r12                \n\t"
979
980
        // [wr0 wr1 wr2 wr3] <= *
981
        // [wr4 wr5 wr6 wr7]
982
        "wldrd wr12, [%[pixels]]        \n\t"
983
        "cmp r12, #8                    \n\t"
984
        "wldrd wr13, [%[pixels], #8]    \n\t"
985
        "wldrd wr14, [%[pixels], #16]   \n\t"
986
        "add %[pixels], %[pixels], %[line_size] \n\t"
987
        "pld [%[pixels]]                \n\t"
988
        "walignr1 wr2, wr12, wr13       \n\t"
989
        "pld [%[pixels], #32]           \n\t"
990
        "walignr1 wr3, wr13, wr14       \n\t"
991
        "wmoveq wr10, wr13              \n\t"
992
        "wmoveq wr11, wr14              \n\t"
993
        "walignr2ne wr10, wr12, wr13    \n\t"
994
        "walignr2ne wr11, wr13, wr14    \n\t"
995
        "wunpckelub wr0, wr2            \n\t"
996
        "wunpckehub wr1, wr2            \n\t"
997
        "wunpckelub wr2, wr3            \n\t"
998
        "wunpckehub wr3, wr3            \n\t"
999
        "wunpckelub wr8, wr10           \n\t"
1000
        "wunpckehub wr9, wr10           \n\t"
1001
        "wunpckelub wr10, wr11          \n\t"
1002
        "wunpckehub wr11, wr11          \n\t"
1003
        "waddhus wr0, wr0, wr8          \n\t"
1004
        "waddhus wr1, wr1, wr9          \n\t"
1005
        "waddhus wr2, wr2, wr10         \n\t"
1006
        "waddhus wr3, wr3, wr11         \n\t"
1007
1008
        "1:                             \n\t"
1009
        // [wr0 wr1 wr2 wr3]
1010
        // [wr4 wr5 wr6 wr7] <= *
1011
        "wldrd wr12, [%[pixels]]        \n\t"
1012
        "cmp r12, #8                    \n\t"
1013
        "wldrd wr13, [%[pixels], #8]    \n\t"
1014
        "wldrd wr14, [%[pixels], #16]   \n\t"
1015
        "add %[pixels], %[pixels], %[line_size] \n\t"
1016
        "walignr1 wr6, wr12, wr13       \n\t"
1017
        "pld [%[pixels]]                \n\t"
1018
        "pld [%[pixels], #32]           \n\t"
1019
        "walignr1 wr7, wr13, wr14       \n\t"
1020
        "wmoveq wr10, wr13              \n\t"
1021
        "wmoveq wr11, wr14              \n\t"
1022
        "walignr2ne wr10, wr12, wr13    \n\t"
1023
        "walignr2ne wr11, wr13, wr14    \n\t"
1024
        "wunpckelub wr4, wr6            \n\t"
1025
        "wunpckehub wr5, wr6            \n\t"
1026
        "wunpckelub wr6, wr7            \n\t"
1027
        "wunpckehub wr7, wr7            \n\t"
1028
        "wunpckelub wr8, wr10           \n\t"
1029
        "wunpckehub wr9, wr10           \n\t"
1030
        "wunpckelub wr10, wr11          \n\t"
1031
        "wunpckehub wr11, wr11          \n\t"
1032
        "waddhus wr4, wr4, wr8          \n\t"
1033
        "waddhus wr5, wr5, wr9          \n\t"
1034
        "waddhus wr6, wr6, wr10         \n\t"
1035
        "waddhus wr7, wr7, wr11         \n\t"
1036
        "waddhus wr8, wr0, wr4          \n\t"
1037
        "waddhus wr9, wr1, wr5          \n\t"
1038
        "waddhus wr10, wr2, wr6         \n\t"
1039
        "waddhus wr11, wr3, wr7         \n\t"
1040
        "waddhus wr8, wr8, wr15         \n\t"
1041
        "waddhus wr9, wr9, wr15         \n\t"
1042
        "waddhus wr10, wr10, wr15       \n\t"
1043
        "waddhus wr11, wr11, wr15       \n\t"
1044
        "wsrlhg wr8, wr8, wcgr0         \n\t"
1045
        "wsrlhg wr9, wr9, wcgr0         \n\t"
1046
        "wldrd wr12, [%[block]]         \n\t"
1047
        "wldrd wr13, [%[block], #8]     \n\t"
1048
        "wsrlhg wr10, wr10, wcgr0       \n\t"
1049
        "wsrlhg wr11, wr11, wcgr0       \n\t"
1050
        "wpackhus wr8, wr8, wr9         \n\t"
1051
        "wpackhus wr9, wr10, wr11       \n\t"
1052
        WAVG2B" wr8, wr8, wr12          \n\t"
1053
        WAVG2B" wr9, wr9, wr13          \n\t"
1054
        "wstrd wr8, [%[block]]          \n\t"
1055
        "wstrd wr9, [%[block], #8]      \n\t"
1056
        "add %[block], %[block], %[line_size]   \n\t"
1057
1058
        // [wr0 wr1 wr2 wr3] <= *
1059
        // [wr4 wr5 wr6 wr7]
1060
        "wldrd wr12, [%[pixels]]        \n\t"
1061
        "pld [%[block]]                 \n\t"
1062
        "wldrd wr13, [%[pixels], #8]    \n\t"
1063
        "pld [%[block], #32]            \n\t"
1064
        "wldrd wr14, [%[pixels], #16]   \n\t"
1065
        "add %[pixels], %[pixels], %[line_size] \n\t"
1066
        "walignr1 wr2, wr12, wr13       \n\t"
1067
        "pld [%[pixels]]                \n\t"
1068
        "pld [%[pixels], #32]           \n\t"
1069
        "walignr1 wr3, wr13, wr14       \n\t"
1070
        "wmoveq wr10, wr13              \n\t"
1071
        "wmoveq wr11, wr14              \n\t"
1072
        "walignr2ne wr10, wr12, wr13    \n\t"
1073
        "walignr2ne wr11, wr13, wr14    \n\t"
1074
        "wunpckelub wr0, wr2            \n\t"
1075
        "wunpckehub wr1, wr2            \n\t"
1076
        "wunpckelub wr2, wr3            \n\t"
1077
        "wunpckehub wr3, wr3            \n\t"
1078
        "wunpckelub wr8, wr10           \n\t"
1079
        "wunpckehub wr9, wr10           \n\t"
1080
        "wunpckelub wr10, wr11          \n\t"
1081
        "wunpckehub wr11, wr11          \n\t"
1082
        "waddhus wr0, wr0, wr8          \n\t"
1083
        "waddhus wr1, wr1, wr9          \n\t"
1084
        "waddhus wr2, wr2, wr10         \n\t"
1085
        "waddhus wr3, wr3, wr11         \n\t"
1086
        "waddhus wr8, wr0, wr4          \n\t"
1087
        "waddhus wr9, wr1, wr5          \n\t"
1088
        "waddhus wr10, wr2, wr6         \n\t"
1089
        "waddhus wr11, wr3, wr7         \n\t"
1090
        "waddhus wr8, wr8, wr15         \n\t"
1091
        "waddhus wr9, wr9, wr15         \n\t"
1092
        "waddhus wr10, wr10, wr15       \n\t"
1093
        "waddhus wr11, wr11, wr15       \n\t"
1094
        "wsrlhg wr8, wr8, wcgr0         \n\t"
1095
        "wsrlhg wr9, wr9, wcgr0         \n\t"
1096
        "wldrd wr12, [%[block]]         \n\t"
1097
        "wldrd wr13, [%[block], #8]     \n\t"
1098
        "wsrlhg wr10, wr10, wcgr0       \n\t"
1099
        "wsrlhg wr11, wr11, wcgr0       \n\t"
1100
        "wpackhus wr8, wr8, wr9         \n\t"
1101
        "wpackhus wr9, wr10, wr11       \n\t"
1102
        WAVG2B" wr8, wr8, wr12          \n\t"
1103
        WAVG2B" wr9, wr9, wr13          \n\t"
1104
        "wstrd wr8, [%[block]]          \n\t"
1105
        "wstrd wr9, [%[block], #8]      \n\t"
1106
        "add %[block], %[block], %[line_size]   \n\t"
1107
        "subs %[h], %[h], #2            \n\t"
1108
        "pld [%[block]]                 \n\t"
1109
        "pld [%[block], #32]            \n\t"
1110
        "bne 1b                         \n\t"
1111
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
1112
        : [line_size]"r"(line_size)
1113
        : "r12", "memory");
1114
}