Statistics
| Branch: | Revision:

ffmpeg / libavcodec / armv4l / dsputil_iwmmxt_rnd.h @ be449fca

History | View | Annotate | Download (47.2 KB)

1 04d7f601 Diego Biurrun
/*
 * iWMMXt optimized DSP utils
 * copyright (c) 2004 AGAWA Koji
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22 22170ec0 Diego Biurrun
/* This header intentionally has no multiple inclusion guards. It is meant to
 * be included multiple times and generates different code depending on the
 * value of certain #defines. */
25 31b2c144 Diego Biurrun
26 6ad1fa5a Bernhard Rosenkränzer
void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
27
{
28
    int stride = line_size;
29 be449fca Diego Pettenò
    __asm__ volatile (
30 6ad1fa5a Bernhard Rosenkränzer
        "and r12, %[pixels], #7 \n\t"
31
        "bic %[pixels], %[pixels], #7 \n\t"
32
        "tmcr wcgr1, r12 \n\t"
33
        "add r4, %[pixels], %[line_size] \n\t"
34
        "add r5, %[block], %[line_size] \n\t"
35
        "mov %[line_size], %[line_size], lsl #1 \n\t"
36
        "1: \n\t"
37
        "wldrd wr0, [%[pixels]] \n\t"
38
        "subs %[h], %[h], #2 \n\t"
39
        "wldrd wr1, [%[pixels], #8] \n\t"
40
        "add %[pixels], %[pixels], %[line_size] \n\t"
41
        "wldrd wr3, [r4] \n\t"
42
        "pld [%[pixels]] \n\t"
43
        "pld [%[pixels], #32] \n\t"
44
        "wldrd wr4, [r4, #8] \n\t"
45
        "add r4, r4, %[line_size] \n\t"
46
        "walignr1 wr8, wr0, wr1 \n\t"
47
        "pld [r4] \n\t"
48
        "pld [r4, #32] \n\t"
49
        "walignr1 wr10, wr3, wr4 \n\t"
50
        "wstrd wr8, [%[block]] \n\t"
51
        "add %[block], %[block], %[line_size] \n\t"
52
        "wstrd wr10, [r5] \n\t"
53
        "add r5, r5, %[line_size] \n\t"
54
        "bne 1b \n\t"
55
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
56
        :
57
        : "memory", "r4", "r5", "r12");
58
}
59
60
void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
61
{
62
    int stride = line_size;
63 be449fca Diego Pettenò
    __asm__ volatile (
64 6ad1fa5a Bernhard Rosenkränzer
        "and r12, %[pixels], #7 \n\t"
65
        "bic %[pixels], %[pixels], #7 \n\t"
66
        "tmcr wcgr1, r12 \n\t"
67
        "add r4, %[pixels], %[line_size] \n\t"
68
        "add r5, %[block], %[line_size] \n\t"
69
        "mov %[line_size], %[line_size], lsl #1 \n\t"
70
        "1: \n\t"
71
        "wldrd wr0, [%[pixels]] \n\t"
72
        "subs %[h], %[h], #2 \n\t"
73
        "wldrd wr1, [%[pixels], #8] \n\t"
74
        "add %[pixels], %[pixels], %[line_size] \n\t"
75
        "wldrd wr3, [r4] \n\t"
76
        "pld [%[pixels]] \n\t"
77
        "pld [%[pixels], #32] \n\t"
78
        "wldrd wr4, [r4, #8] \n\t"
79
        "add r4, r4, %[line_size] \n\t"
80
        "walignr1 wr8, wr0, wr1 \n\t"
81
        "wldrd wr0, [%[block]] \n\t"
82
        "wldrd wr2, [r5] \n\t"
83
        "pld [r4] \n\t"
84
        "pld [r4, #32] \n\t"
85
        "walignr1 wr10, wr3, wr4 \n\t"
86
        WAVG2B" wr8, wr8, wr0 \n\t"
87
        WAVG2B" wr10, wr10, wr2 \n\t"
88
        "wstrd wr8, [%[block]] \n\t"
89
        "add %[block], %[block], %[line_size] \n\t"
90
        "wstrd wr10, [r5] \n\t"
91
        "pld [%[block]] \n\t"
92
        "pld [%[block], #32] \n\t"
93
        "add r5, r5, %[line_size] \n\t"
94
        "pld [r5] \n\t"
95
        "pld [r5, #32] \n\t"
96
        "bne 1b \n\t"
97
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
98
        :
99
        : "memory", "r4", "r5", "r12");
100
}
101
102
void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
103
{
104
    int stride = line_size;
105 be449fca Diego Pettenò
    __asm__ volatile (
106 6ad1fa5a Bernhard Rosenkränzer
        "and r12, %[pixels], #7 \n\t"
107
        "bic %[pixels], %[pixels], #7 \n\t"
108
        "tmcr wcgr1, r12 \n\t"
109
        "add r4, %[pixels], %[line_size] \n\t"
110
        "add r5, %[block], %[line_size] \n\t"
111
        "mov %[line_size], %[line_size], lsl #1 \n\t"
112
        "1: \n\t"
113
        "wldrd wr0, [%[pixels]] \n\t"
114
        "wldrd wr1, [%[pixels], #8] \n\t"
115
        "subs %[h], %[h], #2 \n\t"
116
        "wldrd wr2, [%[pixels], #16] \n\t"
117
        "add %[pixels], %[pixels], %[line_size] \n\t"
118
        "wldrd wr3, [r4] \n\t"
119
        "pld [%[pixels]] \n\t"
120
        "pld [%[pixels], #32] \n\t"
121
        "walignr1 wr8, wr0, wr1 \n\t"
122
        "wldrd wr4, [r4, #8] \n\t"
123
        "walignr1 wr9, wr1, wr2 \n\t"
124
        "wldrd wr5, [r4, #16] \n\t"
125
        "add r4, r4, %[line_size] \n\t"
126
        "pld [r4] \n\t"
127
        "pld [r4, #32] \n\t"
128
        "walignr1 wr10, wr3, wr4 \n\t"
129
        "wstrd wr8, [%[block]] \n\t"
130
        "walignr1 wr11, wr4, wr5 \n\t"
131
        "wstrd wr9, [%[block], #8] \n\t"
132
        "add %[block], %[block], %[line_size] \n\t"
133
        "wstrd wr10, [r5] \n\t"
134
        "wstrd wr11, [r5, #8] \n\t"
135
        "add r5, r5, %[line_size] \n\t"
136
        "bne 1b \n\t"
137
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
138
        :
139
        : "memory", "r4", "r5", "r12");
140
}
141
142
void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
143
{
144
    int stride = line_size;
145 be449fca Diego Pettenò
    __asm__ volatile (
146 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
147
        "pld [%[pixels], #32]           \n\t"
148
        "pld [%[block]]                 \n\t"
149
        "pld [%[block], #32]            \n\t"
150
        "and r12, %[pixels], #7         \n\t"
151
        "bic %[pixels], %[pixels], #7   \n\t"
152
        "tmcr wcgr1, r12                \n\t"
153
        "add r4, %[pixels], %[line_size]\n\t"
154
        "add r5, %[block], %[line_size] \n\t"
155
        "mov %[line_size], %[line_size], lsl #1 \n\t"
156
        "1:                             \n\t"
157
        "wldrd wr0, [%[pixels]]         \n\t"
158
        "wldrd wr1, [%[pixels], #8]     \n\t"
159
        "subs %[h], %[h], #2            \n\t"
160
        "wldrd wr2, [%[pixels], #16]    \n\t"
161
        "add %[pixels], %[pixels], %[line_size] \n\t"
162
        "wldrd wr3, [r4]                \n\t"
163
        "pld [%[pixels]]                \n\t"
164
        "pld [%[pixels], #32]           \n\t"
165
        "walignr1 wr8, wr0, wr1         \n\t"
166
        "wldrd wr4, [r4, #8]            \n\t"
167
        "walignr1 wr9, wr1, wr2         \n\t"
168
        "wldrd wr5, [r4, #16]           \n\t"
169
        "add r4, r4, %[line_size]       \n\t"
170
        "wldrd wr0, [%[block]]          \n\t"
171
        "pld [r4]                       \n\t"
172
        "wldrd wr1, [%[block], #8]      \n\t"
173
        "pld [r4, #32]                  \n\t"
174
        "wldrd wr2, [r5]                \n\t"
175
        "walignr1 wr10, wr3, wr4        \n\t"
176
        "wldrd wr3, [r5, #8]            \n\t"
177
        WAVG2B" wr8, wr8, wr0           \n\t"
178
        WAVG2B" wr9, wr9, wr1           \n\t"
179
        WAVG2B" wr10, wr10, wr2         \n\t"
180
        "wstrd wr8, [%[block]]          \n\t"
181
        "walignr1 wr11, wr4, wr5        \n\t"
182
        WAVG2B" wr11, wr11, wr3         \n\t"
183
        "wstrd wr9, [%[block], #8]      \n\t"
184
        "add %[block], %[block], %[line_size] \n\t"
185
        "wstrd wr10, [r5]               \n\t"
186
        "pld [%[block]]                 \n\t"
187
        "pld [%[block], #32]            \n\t"
188
        "wstrd wr11, [r5, #8]           \n\t"
189
        "add r5, r5, %[line_size]       \n\t"
190
        "pld [r5]                       \n\t"
191
        "pld [r5, #32]                  \n\t"
192
        "bne 1b \n\t"
193
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
194
        :
195
        : "memory", "r4", "r5", "r12");
196
}
197
198
void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
199
{
200
    int stride = line_size;
201
    // [wr0 wr1 wr2 wr3] for previous line
202
    // [wr4 wr5 wr6 wr7] for current line
203
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
204 be449fca Diego Pettenò
    __asm__ volatile(
205 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
206
        "pld [%[pixels], #32]           \n\t"
207
        "and r12, %[pixels], #7         \n\t"
208
        "bic %[pixels], %[pixels], #7   \n\t"
209
        "tmcr wcgr1, r12                \n\t"
210
        "add r12, r12, #1               \n\t"
211
        "add r4, %[pixels], %[line_size]\n\t"
212
        "tmcr wcgr2, r12                \n\t"
213
        "add r5, %[block], %[line_size] \n\t"
214
        "mov %[line_size], %[line_size], lsl #1 \n\t"
215
216
        "1:                             \n\t"
217
        "wldrd wr10, [%[pixels]]        \n\t"
218
        "cmp r12, #8                    \n\t"
219
        "wldrd wr11, [%[pixels], #8]    \n\t"
220
        "add %[pixels], %[pixels], %[line_size] \n\t"
221
        "wldrd wr13, [r4]               \n\t"
222
        "pld [%[pixels]]                \n\t"
223
        "wldrd wr14, [r4, #8]           \n\t"
224
        "pld [%[pixels], #32]           \n\t"
225
        "add r4, r4, %[line_size]       \n\t"
226
        "walignr1 wr0, wr10, wr11       \n\t"
227
        "pld [r4]                       \n\t"
228
        "pld [r4, #32]                  \n\t"
229
        "walignr1 wr2, wr13, wr14       \n\t"
230
        "wmoveq wr4, wr11               \n\t"
231
        "wmoveq wr6, wr14               \n\t"
232
        "walignr2ne wr4, wr10, wr11     \n\t"
233
        "walignr2ne wr6, wr13, wr14     \n\t"
234
        WAVG2B" wr0, wr0, wr4           \n\t"
235
        WAVG2B" wr2, wr2, wr6           \n\t"
236
        "wstrd wr0, [%[block]]          \n\t"
237
        "subs %[h], %[h], #2            \n\t"
238
        "wstrd wr2, [r5]                \n\t"
239
        "add %[block], %[block], %[line_size]   \n\t"
240
        "add r5, r5, %[line_size]       \n\t"
241
        "bne 1b                         \n\t"
242
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
243
        :
244
        : "r4", "r5", "r12", "memory");
245
}
246
247
void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
248
{
249
    int stride = line_size;
250
    // [wr0 wr1 wr2 wr3] for previous line
251
    // [wr4 wr5 wr6 wr7] for current line
252
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
253 be449fca Diego Pettenò
    __asm__ volatile(
254 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
255
        "pld [%[pixels], #32]           \n\t"
256
        "and r12, %[pixels], #7         \n\t"
257
        "bic %[pixels], %[pixels], #7   \n\t"
258
        "tmcr wcgr1, r12                \n\t"
259
        "add r12, r12, #1               \n\t"
260
        "add r4, %[pixels], %[line_size]\n\t"
261
        "tmcr wcgr2, r12                \n\t"
262
        "add r5, %[block], %[line_size] \n\t"
263
        "mov %[line_size], %[line_size], lsl #1 \n\t"
264
265
        "1:                             \n\t"
266
        "wldrd wr10, [%[pixels]]        \n\t"
267
        "cmp r12, #8                    \n\t"
268
        "wldrd wr11, [%[pixels], #8]    \n\t"
269
        "wldrd wr12, [%[pixels], #16]   \n\t"
270
        "add %[pixels], %[pixels], %[line_size] \n\t"
271
        "wldrd wr13, [r4]               \n\t"
272
        "pld [%[pixels]]                \n\t"
273
        "wldrd wr14, [r4, #8]           \n\t"
274
        "pld [%[pixels], #32]           \n\t"
275
        "wldrd wr15, [r4, #16]          \n\t"
276
        "add r4, r4, %[line_size]       \n\t"
277
        "walignr1 wr0, wr10, wr11       \n\t"
278
        "pld [r4]                       \n\t"
279
        "pld [r4, #32]                  \n\t"
280
        "walignr1 wr1, wr11, wr12       \n\t"
281
        "walignr1 wr2, wr13, wr14       \n\t"
282
        "walignr1 wr3, wr14, wr15       \n\t"
283
        "wmoveq wr4, wr11               \n\t"
284
        "wmoveq wr5, wr12               \n\t"
285
        "wmoveq wr6, wr14               \n\t"
286
        "wmoveq wr7, wr15               \n\t"
287
        "walignr2ne wr4, wr10, wr11     \n\t"
288
        "walignr2ne wr5, wr11, wr12     \n\t"
289
        "walignr2ne wr6, wr13, wr14     \n\t"
290
        "walignr2ne wr7, wr14, wr15     \n\t"
291
        WAVG2B" wr0, wr0, wr4           \n\t"
292
        WAVG2B" wr1, wr1, wr5           \n\t"
293
        "wstrd wr0, [%[block]]          \n\t"
294
        WAVG2B" wr2, wr2, wr6           \n\t"
295
        "wstrd wr1, [%[block], #8]      \n\t"
296
        WAVG2B" wr3, wr3, wr7           \n\t"
297
        "add %[block], %[block], %[line_size]   \n\t"
298
        "wstrd wr2, [r5]                \n\t"
299
        "subs %[h], %[h], #2            \n\t"
300
        "wstrd wr3, [r5, #8]            \n\t"
301
        "add r5, r5, %[line_size]       \n\t"
302
        "bne 1b                         \n\t"
303
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
304
        :
305
        : "r4", "r5", "r12", "memory");
306
}
307
308
void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
309
{
310
    int stride = line_size;
311
    // [wr0 wr1 wr2 wr3] for previous line
312
    // [wr4 wr5 wr6 wr7] for current line
313
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
314 be449fca Diego Pettenò
    __asm__ volatile(
315 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
316
        "pld [%[pixels], #32]           \n\t"
317
        "pld [%[block]]                 \n\t"
318
        "pld [%[block], #32]            \n\t"
319
        "and r12, %[pixels], #7         \n\t"
320
        "bic %[pixels], %[pixels], #7   \n\t"
321
        "tmcr wcgr1, r12                \n\t"
322
        "add r12, r12, #1               \n\t"
323
        "add r4, %[pixels], %[line_size]\n\t"
324
        "tmcr wcgr2, r12                \n\t"
325
        "add r5, %[block], %[line_size] \n\t"
326
        "mov %[line_size], %[line_size], lsl #1 \n\t"
327
        "pld [r5]                       \n\t"
328
        "pld [r5, #32]                  \n\t"
329
330
        "1:                             \n\t"
331
        "wldrd wr10, [%[pixels]]        \n\t"
332
        "cmp r12, #8                    \n\t"
333
        "wldrd wr11, [%[pixels], #8]    \n\t"
334
        "add %[pixels], %[pixels], %[line_size] \n\t"
335
        "wldrd wr13, [r4]               \n\t"
336
        "pld [%[pixels]]                \n\t"
337
        "wldrd wr14, [r4, #8]           \n\t"
338
        "pld [%[pixels], #32]           \n\t"
339
        "add r4, r4, %[line_size]       \n\t"
340
        "walignr1 wr0, wr10, wr11       \n\t"
341
        "pld [r4]                       \n\t"
342
        "pld [r4, #32]                  \n\t"
343
        "walignr1 wr2, wr13, wr14       \n\t"
344
        "wmoveq wr4, wr11               \n\t"
345
        "wmoveq wr6, wr14               \n\t"
346
        "walignr2ne wr4, wr10, wr11     \n\t"
347
        "wldrd wr10, [%[block]]         \n\t"
348
        "walignr2ne wr6, wr13, wr14     \n\t"
349
        "wldrd wr12, [r5]               \n\t"
350
        WAVG2B" wr0, wr0, wr4           \n\t"
351
        WAVG2B" wr2, wr2, wr6           \n\t"
352
        WAVG2B" wr0, wr0, wr10          \n\t"
353
        WAVG2B" wr2, wr2, wr12          \n\t"
354
        "wstrd wr0, [%[block]]          \n\t"
355
        "subs %[h], %[h], #2            \n\t"
356
        "wstrd wr2, [r5]                \n\t"
357
        "add %[block], %[block], %[line_size]   \n\t"
358
        "add r5, r5, %[line_size]       \n\t"
359
        "pld [%[block]]                 \n\t"
360
        "pld [%[block], #32]            \n\t"
361
        "pld [r5]                       \n\t"
362
        "pld [r5, #32]                  \n\t"
363
        "bne 1b                         \n\t"
364
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
365
        :
366
        : "r4", "r5", "r12", "memory");
367
}
368
369
void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
370
{
371
    int stride = line_size;
372
    // [wr0 wr1 wr2 wr3] for previous line
373
    // [wr4 wr5 wr6 wr7] for current line
374
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
375 be449fca Diego Pettenò
    __asm__ volatile(
376 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
377
        "pld [%[pixels], #32]           \n\t"
378
        "pld [%[block]]                 \n\t"
379
        "pld [%[block], #32]            \n\t"
380
        "and r12, %[pixels], #7         \n\t"
381
        "bic %[pixels], %[pixels], #7   \n\t"
382
        "tmcr wcgr1, r12                \n\t"
383
        "add r12, r12, #1               \n\t"
384
        "add r4, %[pixels], %[line_size]\n\t"
385
        "tmcr wcgr2, r12                \n\t"
386
        "add r5, %[block], %[line_size] \n\t"
387
        "mov %[line_size], %[line_size], lsl #1 \n\t"
388
        "pld [r5]                       \n\t"
389
        "pld [r5, #32]                  \n\t"
390
391
        "1:                             \n\t"
392
        "wldrd wr10, [%[pixels]]        \n\t"
393
        "cmp r12, #8                    \n\t"
394
        "wldrd wr11, [%[pixels], #8]    \n\t"
395
        "wldrd wr12, [%[pixels], #16]   \n\t"
396
        "add %[pixels], %[pixels], %[line_size] \n\t"
397
        "wldrd wr13, [r4]               \n\t"
398
        "pld [%[pixels]]                \n\t"
399
        "wldrd wr14, [r4, #8]           \n\t"
400
        "pld [%[pixels], #32]           \n\t"
401
        "wldrd wr15, [r4, #16]          \n\t"
402
        "add r4, r4, %[line_size]       \n\t"
403
        "walignr1 wr0, wr10, wr11       \n\t"
404
        "pld [r4]                       \n\t"
405
        "pld [r4, #32]                  \n\t"
406
        "walignr1 wr1, wr11, wr12       \n\t"
407
        "walignr1 wr2, wr13, wr14       \n\t"
408
        "walignr1 wr3, wr14, wr15       \n\t"
409
        "wmoveq wr4, wr11               \n\t"
410
        "wmoveq wr5, wr12               \n\t"
411
        "wmoveq wr6, wr14               \n\t"
412
        "wmoveq wr7, wr15               \n\t"
413
        "walignr2ne wr4, wr10, wr11     \n\t"
414
        "walignr2ne wr5, wr11, wr12     \n\t"
415
        "walignr2ne wr6, wr13, wr14     \n\t"
416
        "walignr2ne wr7, wr14, wr15     \n\t"
417
        "wldrd wr10, [%[block]]         \n\t"
418
        WAVG2B" wr0, wr0, wr4           \n\t"
419
        "wldrd wr11, [%[block], #8]     \n\t"
420
        WAVG2B" wr1, wr1, wr5           \n\t"
421
        "wldrd wr12, [r5]               \n\t"
422
        WAVG2B" wr2, wr2, wr6           \n\t"
423
        "wldrd wr13, [r5, #8]           \n\t"
424
        WAVG2B" wr3, wr3, wr7           \n\t"
425
        WAVG2B" wr0, wr0, wr10          \n\t"
426
        WAVG2B" wr1, wr1, wr11          \n\t"
427
        WAVG2B" wr2, wr2, wr12          \n\t"
428
        WAVG2B" wr3, wr3, wr13          \n\t"
429
        "wstrd wr0, [%[block]]          \n\t"
430
        "subs %[h], %[h], #2            \n\t"
431
        "wstrd wr1, [%[block], #8]      \n\t"
432
        "add %[block], %[block], %[line_size]   \n\t"
433
        "wstrd wr2, [r5]                \n\t"
434
        "pld [%[block]]                 \n\t"
435
        "wstrd wr3, [r5, #8]            \n\t"
436
        "add r5, r5, %[line_size]       \n\t"
437
        "pld [%[block], #32]            \n\t"
438
        "pld [r5]                       \n\t"
439
        "pld [r5, #32]                  \n\t"
440
        "bne 1b                         \n\t"
441
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
442
        :
443
        :"r4", "r5", "r12", "memory");
444
}
445
446
void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
447
{
448
    int stride = line_size;
449
    // [wr0 wr1 wr2 wr3] for previous line
450
    // [wr4 wr5 wr6 wr7] for current line
451 be449fca Diego Pettenò
    __asm__ volatile(
452 6ad1fa5a Bernhard Rosenkränzer
        "pld            [%[pixels]]                             \n\t"
453
        "pld            [%[pixels], #32]                        \n\t"
454
        "and            r12, %[pixels], #7                      \n\t"
455
        "tmcr           wcgr1, r12                              \n\t"
456
        "bic            %[pixels], %[pixels], #7                \n\t"
457
458
        "wldrd          wr10, [%[pixels]]                       \n\t"
459
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
460
        "pld            [%[block]]                              \n\t"
461
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
462
        "walignr1       wr0, wr10, wr11                         \n\t"
463
        "pld            [%[pixels]]                             \n\t"
464
        "pld            [%[pixels], #32]                        \n\t"
465
466
      "1:                                                       \n\t"
467
        "wldrd          wr10, [%[pixels]]                       \n\t"
468
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
469
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
470
        "pld            [%[pixels]]                             \n\t"
471
        "pld            [%[pixels], #32]                        \n\t"
472
        "walignr1       wr4, wr10, wr11                         \n\t"
473
        "wldrd          wr10, [%[block]]                        \n\t"
474
         WAVG2B"        wr8, wr0, wr4                           \n\t"
475
         WAVG2B"        wr8, wr8, wr10                          \n\t"
476
        "wstrd          wr8, [%[block]]                         \n\t"
477
        "add            %[block], %[block], %[line_size]        \n\t"
478
479
        "wldrd          wr10, [%[pixels]]                       \n\t"
480
        "wldrd          wr11, [%[pixels], #8]                   \n\t"
481
        "pld            [%[block]]                              \n\t"
482
        "add            %[pixels], %[pixels], %[line_size]      \n\t"
483
        "pld            [%[pixels]]                             \n\t"
484
        "pld            [%[pixels], #32]                        \n\t"
485
        "walignr1       wr0, wr10, wr11                         \n\t"
486
        "wldrd          wr10, [%[block]]                        \n\t"
487
         WAVG2B"        wr8, wr0, wr4                           \n\t"
488
         WAVG2B"        wr8, wr8, wr10                          \n\t"
489
        "wstrd          wr8, [%[block]]                         \n\t"
490
        "add            %[block], %[block], %[line_size]        \n\t"
491
492
        "subs           %[h], %[h], #2                          \n\t"
493
        "pld            [%[block]]                              \n\t"
494
        "bne            1b                                      \n\t"
495
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
496
        :
497
        : "cc", "memory", "r12");
498
}
499
500
void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
501
{
502
    int stride = line_size;
503
    // [wr0 wr1 wr2 wr3] for previous line
504
    // [wr4 wr5 wr6 wr7] for current line
505 be449fca Diego Pettenò
    __asm__ volatile(
506 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
507
        "pld [%[pixels], #32]           \n\t"
508
        "and r12, %[pixels], #7         \n\t"
509
        "tmcr wcgr1, r12                \n\t"
510
        "bic %[pixels], %[pixels], #7   \n\t"
511
512
        "wldrd wr10, [%[pixels]]        \n\t"
513
        "wldrd wr11, [%[pixels], #8]    \n\t"
514
        "wldrd wr12, [%[pixels], #16]   \n\t"
515
        "add %[pixels], %[pixels], %[line_size] \n\t"
516
        "pld [%[pixels]]                \n\t"
517
        "pld [%[pixels], #32]           \n\t"
518
        "walignr1 wr0, wr10, wr11       \n\t"
519
        "walignr1 wr1, wr11, wr12       \n\t"
520
521
        "1:                             \n\t"
522
        "wldrd wr10, [%[pixels]]        \n\t"
523
        "wldrd wr11, [%[pixels], #8]    \n\t"
524
        "wldrd wr12, [%[pixels], #16]   \n\t"
525
        "add %[pixels], %[pixels], %[line_size] \n\t"
526
        "pld [%[pixels]]                \n\t"
527
        "pld [%[pixels], #32]           \n\t"
528
        "walignr1 wr4, wr10, wr11       \n\t"
529
        "walignr1 wr5, wr11, wr12       \n\t"
530
        WAVG2B" wr8, wr0, wr4           \n\t"
531
        WAVG2B" wr9, wr1, wr5           \n\t"
532
        "wstrd wr8, [%[block]]          \n\t"
533
        "wstrd wr9, [%[block], #8]      \n\t"
534
        "add %[block], %[block], %[line_size]   \n\t"
535
536
        "wldrd wr10, [%[pixels]]        \n\t"
537
        "wldrd wr11, [%[pixels], #8]    \n\t"
538
        "wldrd wr12, [%[pixels], #16]   \n\t"
539
        "add %[pixels], %[pixels], %[line_size] \n\t"
540
        "pld [%[pixels]]                \n\t"
541
        "pld [%[pixels], #32]           \n\t"
542
        "walignr1 wr0, wr10, wr11       \n\t"
543
        "walignr1 wr1, wr11, wr12       \n\t"
544
        WAVG2B" wr8, wr0, wr4           \n\t"
545
        WAVG2B" wr9, wr1, wr5           \n\t"
546
        "wstrd wr8, [%[block]]          \n\t"
547
        "wstrd wr9, [%[block], #8]      \n\t"
548
        "add %[block], %[block], %[line_size]   \n\t"
549
550
        "subs %[h], %[h], #2            \n\t"
551
        "bne 1b                         \n\t"
552
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
553
        :
554
        : "r4", "r5", "r12", "memory");
555
}
556
557
void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
558
{
559
    int stride = line_size;
560
    // [wr0 wr1 wr2 wr3] for previous line
561
    // [wr4 wr5 wr6 wr7] for current line
562 be449fca Diego Pettenò
    __asm__ volatile(
563 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
564
        "pld [%[pixels], #32]           \n\t"
565
        "and r12, %[pixels], #7         \n\t"
566
        "tmcr wcgr1, r12                \n\t"
567
        "bic %[pixels], %[pixels], #7   \n\t"
568
569
        "wldrd wr10, [%[pixels]]        \n\t"
570
        "wldrd wr11, [%[pixels], #8]    \n\t"
571
        "pld [%[block]]                 \n\t"
572
        "wldrd wr12, [%[pixels], #16]   \n\t"
573
        "add %[pixels], %[pixels], %[line_size] \n\t"
574
        "pld [%[pixels]]                \n\t"
575
        "pld [%[pixels], #32]           \n\t"
576
        "walignr1 wr0, wr10, wr11       \n\t"
577
        "walignr1 wr1, wr11, wr12       \n\t"
578
579
        "1:                             \n\t"
580
        "wldrd wr10, [%[pixels]]        \n\t"
581
        "wldrd wr11, [%[pixels], #8]    \n\t"
582
        "wldrd wr12, [%[pixels], #16]   \n\t"
583
        "add %[pixels], %[pixels], %[line_size] \n\t"
584
        "pld [%[pixels]]                \n\t"
585
        "pld [%[pixels], #32]           \n\t"
586
        "walignr1 wr4, wr10, wr11       \n\t"
587
        "walignr1 wr5, wr11, wr12       \n\t"
588
        "wldrd wr10, [%[block]]         \n\t"
589
        "wldrd wr11, [%[block], #8]     \n\t"
590
        WAVG2B" wr8, wr0, wr4           \n\t"
591
        WAVG2B" wr9, wr1, wr5           \n\t"
592
        WAVG2B" wr8, wr8, wr10          \n\t"
593
        WAVG2B" wr9, wr9, wr11          \n\t"
594
        "wstrd wr8, [%[block]]          \n\t"
595
        "wstrd wr9, [%[block], #8]      \n\t"
596
        "add %[block], %[block], %[line_size]   \n\t"
597
598
        "wldrd wr10, [%[pixels]]        \n\t"
599
        "wldrd wr11, [%[pixels], #8]    \n\t"
600
        "pld [%[block]]                 \n\t"
601
        "wldrd wr12, [%[pixels], #16]   \n\t"
602
        "add %[pixels], %[pixels], %[line_size] \n\t"
603
        "pld [%[pixels]]                \n\t"
604
        "pld [%[pixels], #32]           \n\t"
605
        "walignr1 wr0, wr10, wr11       \n\t"
606
        "walignr1 wr1, wr11, wr12       \n\t"
607
        "wldrd wr10, [%[block]]         \n\t"
608
        "wldrd wr11, [%[block], #8]     \n\t"
609
        WAVG2B" wr8, wr0, wr4           \n\t"
610
        WAVG2B" wr9, wr1, wr5           \n\t"
611
        WAVG2B" wr8, wr8, wr10          \n\t"
612
        WAVG2B" wr9, wr9, wr11          \n\t"
613
        "wstrd wr8, [%[block]]          \n\t"
614
        "wstrd wr9, [%[block], #8]      \n\t"
615
        "add %[block], %[block], %[line_size]   \n\t"
616
617
        "subs %[h], %[h], #2            \n\t"
618
        "pld [%[block]]                 \n\t"
619
        "bne 1b                         \n\t"
620
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
621
        :
622
        : "r4", "r5", "r12", "memory");
623
}
624
625
void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
626
{
627
    // [wr0 wr1 wr2 wr3] for previous line
628
    // [wr4 wr5 wr6 wr7] for current line
629
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
630 be449fca Diego Pettenò
    __asm__ volatile(
631 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
632
        "mov r12, #2                    \n\t"
633
        "pld [%[pixels], #32]           \n\t"
634
        "tmcr wcgr0, r12                \n\t" /* for shift value */
635
        "and r12, %[pixels], #7         \n\t"
636
        "bic %[pixels], %[pixels], #7   \n\t"
637
        "tmcr wcgr1, r12                \n\t"
638
639
        // [wr0 wr1 wr2 wr3] <= *
640
        // [wr4 wr5 wr6 wr7]
641
        "wldrd wr12, [%[pixels]]        \n\t"
642
        "add r12, r12, #1               \n\t"
643
        "wldrd wr13, [%[pixels], #8]    \n\t"
644
        "tmcr wcgr2, r12                \n\t"
645
        "add %[pixels], %[pixels], %[line_size] \n\t"
646
        "cmp r12, #8                    \n\t"
647
        "pld [%[pixels]]                \n\t"
648
        "pld [%[pixels], #32]           \n\t"
649
        "walignr1 wr2, wr12, wr13       \n\t"
650
        "wmoveq wr10, wr13              \n\t"
651
        "walignr2ne wr10, wr12, wr13    \n\t"
652
        "wunpckelub wr0, wr2            \n\t"
653
        "wunpckehub wr1, wr2            \n\t"
654
        "wunpckelub wr8, wr10           \n\t"
655
        "wunpckehub wr9, wr10           \n\t"
656
        "waddhus wr0, wr0, wr8          \n\t"
657
        "waddhus wr1, wr1, wr9          \n\t"
658
659
        "1:                             \n\t"
660
        // [wr0 wr1 wr2 wr3]
661
        // [wr4 wr5 wr6 wr7] <= *
662
        "wldrd wr12, [%[pixels]]        \n\t"
663
        "cmp r12, #8                    \n\t"
664
        "wldrd wr13, [%[pixels], #8]    \n\t"
665
        "add %[pixels], %[pixels], %[line_size] \n\t"
666
        "walignr1 wr6, wr12, wr13       \n\t"
667
        "pld [%[pixels]]                \n\t"
668
        "pld [%[pixels], #32]           \n\t"
669
        "wmoveq wr10, wr13              \n\t"
670
        "walignr2ne wr10, wr12, wr13    \n\t"
671
        "wunpckelub wr4, wr6            \n\t"
672
        "wunpckehub wr5, wr6            \n\t"
673
        "wunpckelub wr8, wr10           \n\t"
674
        "wunpckehub wr9, wr10           \n\t"
675
        "waddhus wr4, wr4, wr8          \n\t"
676
        "waddhus wr5, wr5, wr9          \n\t"
677
        "waddhus wr8, wr0, wr4          \n\t"
678
        "waddhus wr9, wr1, wr5          \n\t"
679
        "waddhus wr8, wr8, wr15         \n\t"
680
        "waddhus wr9, wr9, wr15         \n\t"
681
        "wsrlhg wr8, wr8, wcgr0         \n\t"
682
        "wsrlhg wr9, wr9, wcgr0         \n\t"
683
        "wpackhus wr8, wr8, wr9         \n\t"
684
        "wstrd wr8, [%[block]]          \n\t"
685
        "add %[block], %[block], %[line_size]   \n\t"
686
687
        // [wr0 wr1 wr2 wr3] <= *
688
        // [wr4 wr5 wr6 wr7]
689
        "wldrd wr12, [%[pixels]]        \n\t"
690
        "wldrd wr13, [%[pixels], #8]    \n\t"
691
        "add %[pixels], %[pixels], %[line_size] \n\t"
692
        "walignr1 wr2, wr12, wr13       \n\t"
693
        "pld [%[pixels]]                \n\t"
694
        "pld [%[pixels], #32]           \n\t"
695
        "wmoveq wr10, wr13              \n\t"
696
        "walignr2ne wr10, wr12, wr13    \n\t"
697
        "wunpckelub wr0, wr2            \n\t"
698
        "wunpckehub wr1, wr2            \n\t"
699
        "wunpckelub wr8, wr10           \n\t"
700
        "wunpckehub wr9, wr10           \n\t"
701
        "waddhus wr0, wr0, wr8          \n\t"
702
        "waddhus wr1, wr1, wr9          \n\t"
703
        "waddhus wr8, wr0, wr4          \n\t"
704
        "waddhus wr9, wr1, wr5          \n\t"
705
        "waddhus wr8, wr8, wr15         \n\t"
706
        "waddhus wr9, wr9, wr15         \n\t"
707
        "wsrlhg wr8, wr8, wcgr0         \n\t"
708
        "wsrlhg wr9, wr9, wcgr0         \n\t"
709
        "wpackhus wr8, wr8, wr9         \n\t"
710
        "subs %[h], %[h], #2            \n\t"
711
        "wstrd wr8, [%[block]]          \n\t"
712
        "add %[block], %[block], %[line_size]   \n\t"
713
        "bne 1b                         \n\t"
714
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
715
        : [line_size]"r"(line_size)
716
        : "r12", "memory");
717
}
718
719
void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
720
{
721
    // [wr0 wr1 wr2 wr3] for previous line
722
    // [wr4 wr5 wr6 wr7] for current line
723
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
724 be449fca Diego Pettenò
    __asm__ volatile(
725 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[pixels]]                \n\t"
726
        "mov r12, #2                    \n\t"
727
        "pld [%[pixels], #32]           \n\t"
728
        "tmcr wcgr0, r12                \n\t" /* for shift value */
729
        /* alignment */
730
        "and r12, %[pixels], #7         \n\t"
731
        "bic %[pixels], %[pixels], #7   \n\t"
732
        "tmcr wcgr1, r12                \n\t"
733
        "add r12, r12, #1               \n\t"
734
        "tmcr wcgr2, r12                \n\t"
735
736
        // [wr0 wr1 wr2 wr3] <= *
737
        // [wr4 wr5 wr6 wr7]
738
        "wldrd wr12, [%[pixels]]        \n\t"
739
        "cmp r12, #8                    \n\t"
740
        "wldrd wr13, [%[pixels], #8]    \n\t"
741
        "wldrd wr14, [%[pixels], #16]   \n\t"
742
        "add %[pixels], %[pixels], %[line_size] \n\t"
743
        "pld [%[pixels]]                \n\t"
744
        "walignr1 wr2, wr12, wr13       \n\t"
745
        "pld [%[pixels], #32]           \n\t"
746
        "walignr1 wr3, wr13, wr14       \n\t"
747
        "wmoveq wr10, wr13              \n\t"
748
        "wmoveq wr11, wr14              \n\t"
749
        "walignr2ne wr10, wr12, wr13    \n\t"
750
        "walignr2ne wr11, wr13, wr14    \n\t"
751
        "wunpckelub wr0, wr2            \n\t"
752
        "wunpckehub wr1, wr2            \n\t"
753
        "wunpckelub wr2, wr3            \n\t"
754
        "wunpckehub wr3, wr3            \n\t"
755
        "wunpckelub wr8, wr10           \n\t"
756
        "wunpckehub wr9, wr10           \n\t"
757
        "wunpckelub wr10, wr11          \n\t"
758
        "wunpckehub wr11, wr11          \n\t"
759
        "waddhus wr0, wr0, wr8          \n\t"
760
        "waddhus wr1, wr1, wr9          \n\t"
761
        "waddhus wr2, wr2, wr10         \n\t"
762
        "waddhus wr3, wr3, wr11         \n\t"
763
764
        "1:                             \n\t"
765
        // [wr0 wr1 wr2 wr3]
766
        // [wr4 wr5 wr6 wr7] <= *
767
        "wldrd wr12, [%[pixels]]        \n\t"
768
        "cmp r12, #8                    \n\t"
769
        "wldrd wr13, [%[pixels], #8]    \n\t"
770
        "wldrd wr14, [%[pixels], #16]   \n\t"
771
        "add %[pixels], %[pixels], %[line_size] \n\t"
772
        "walignr1 wr6, wr12, wr13       \n\t"
773
        "pld [%[pixels]]                \n\t"
774
        "pld [%[pixels], #32]           \n\t"
775
        "walignr1 wr7, wr13, wr14       \n\t"
776
        "wmoveq wr10, wr13              \n\t"
777
        "wmoveq wr11, wr14              \n\t"
778
        "walignr2ne wr10, wr12, wr13    \n\t"
779
        "walignr2ne wr11, wr13, wr14    \n\t"
780
        "wunpckelub wr4, wr6            \n\t"
781
        "wunpckehub wr5, wr6            \n\t"
782
        "wunpckelub wr6, wr7            \n\t"
783
        "wunpckehub wr7, wr7            \n\t"
784
        "wunpckelub wr8, wr10           \n\t"
785
        "wunpckehub wr9, wr10           \n\t"
786
        "wunpckelub wr10, wr11          \n\t"
787
        "wunpckehub wr11, wr11          \n\t"
788
        "waddhus wr4, wr4, wr8          \n\t"
789
        "waddhus wr5, wr5, wr9          \n\t"
790
        "waddhus wr6, wr6, wr10         \n\t"
791
        "waddhus wr7, wr7, wr11         \n\t"
792
        "waddhus wr8, wr0, wr4          \n\t"
793
        "waddhus wr9, wr1, wr5          \n\t"
794
        "waddhus wr10, wr2, wr6         \n\t"
795
        "waddhus wr11, wr3, wr7         \n\t"
796
        "waddhus wr8, wr8, wr15         \n\t"
797
        "waddhus wr9, wr9, wr15         \n\t"
798
        "waddhus wr10, wr10, wr15       \n\t"
799
        "waddhus wr11, wr11, wr15       \n\t"
800
        "wsrlhg wr8, wr8, wcgr0         \n\t"
801
        "wsrlhg wr9, wr9, wcgr0         \n\t"
802
        "wsrlhg wr10, wr10, wcgr0       \n\t"
803
        "wsrlhg wr11, wr11, wcgr0       \n\t"
804
        "wpackhus wr8, wr8, wr9         \n\t"
805
        "wpackhus wr9, wr10, wr11       \n\t"
806
        "wstrd wr8, [%[block]]          \n\t"
807
        "wstrd wr9, [%[block], #8]      \n\t"
808
        "add %[block], %[block], %[line_size]   \n\t"
809
810
        // [wr0 wr1 wr2 wr3] <= *
811
        // [wr4 wr5 wr6 wr7]
812
        "wldrd wr12, [%[pixels]]        \n\t"
813
        "wldrd wr13, [%[pixels], #8]    \n\t"
814
        "wldrd wr14, [%[pixels], #16]   \n\t"
815
        "add %[pixels], %[pixels], %[line_size] \n\t"
816
        "walignr1 wr2, wr12, wr13       \n\t"
817
        "pld [%[pixels]]                \n\t"
818
        "pld [%[pixels], #32]           \n\t"
819
        "walignr1 wr3, wr13, wr14       \n\t"
820
        "wmoveq wr10, wr13              \n\t"
821
        "wmoveq wr11, wr14              \n\t"
822
        "walignr2ne wr10, wr12, wr13    \n\t"
823
        "walignr2ne wr11, wr13, wr14    \n\t"
824
        "wunpckelub wr0, wr2            \n\t"
825
        "wunpckehub wr1, wr2            \n\t"
826
        "wunpckelub wr2, wr3            \n\t"
827
        "wunpckehub wr3, wr3            \n\t"
828
        "wunpckelub wr8, wr10           \n\t"
829
        "wunpckehub wr9, wr10           \n\t"
830
        "wunpckelub wr10, wr11          \n\t"
831
        "wunpckehub wr11, wr11          \n\t"
832
        "waddhus wr0, wr0, wr8          \n\t"
833
        "waddhus wr1, wr1, wr9          \n\t"
834
        "waddhus wr2, wr2, wr10         \n\t"
835
        "waddhus wr3, wr3, wr11         \n\t"
836
        "waddhus wr8, wr0, wr4          \n\t"
837
        "waddhus wr9, wr1, wr5          \n\t"
838
        "waddhus wr10, wr2, wr6         \n\t"
839
        "waddhus wr11, wr3, wr7         \n\t"
840
        "waddhus wr8, wr8, wr15         \n\t"
841
        "waddhus wr9, wr9, wr15         \n\t"
842
        "waddhus wr10, wr10, wr15       \n\t"
843
        "waddhus wr11, wr11, wr15       \n\t"
844
        "wsrlhg wr8, wr8, wcgr0         \n\t"
845
        "wsrlhg wr9, wr9, wcgr0         \n\t"
846
        "wsrlhg wr10, wr10, wcgr0       \n\t"
847
        "wsrlhg wr11, wr11, wcgr0       \n\t"
848
        "wpackhus wr8, wr8, wr9         \n\t"
849
        "wpackhus wr9, wr10, wr11       \n\t"
850
        "wstrd wr8, [%[block]]          \n\t"
851
        "wstrd wr9, [%[block], #8]      \n\t"
852
        "add %[block], %[block], %[line_size]   \n\t"
853
854
        "subs %[h], %[h], #2            \n\t"
855
        "bne 1b                         \n\t"
856
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
857
        : [line_size]"r"(line_size)
858
        : "r12", "memory");
859
}
860
861
void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
862
{
863
    // [wr0 wr1 wr2 wr3] for previous line
864
    // [wr4 wr5 wr6 wr7] for current line
865
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
866 be449fca Diego Pettenò
    __asm__ volatile(
867 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[block]]                 \n\t"
868
        "pld [%[block], #32]            \n\t"
869
        "pld [%[pixels]]                \n\t"
870
        "mov r12, #2                    \n\t"
871
        "pld [%[pixels], #32]           \n\t"
872
        "tmcr wcgr0, r12                \n\t" /* for shift value */
873
        "and r12, %[pixels], #7         \n\t"
874
        "bic %[pixels], %[pixels], #7   \n\t"
875
        "tmcr wcgr1, r12                \n\t"
876
877
        // [wr0 wr1 wr2 wr3] <= *
878
        // [wr4 wr5 wr6 wr7]
879
        "wldrd wr12, [%[pixels]]        \n\t"
880
        "add r12, r12, #1               \n\t"
881
        "wldrd wr13, [%[pixels], #8]    \n\t"
882
        "tmcr wcgr2, r12                \n\t"
883
        "add %[pixels], %[pixels], %[line_size] \n\t"
884
        "cmp r12, #8                    \n\t"
885
        "pld [%[pixels]]                \n\t"
886
        "pld [%[pixels], #32]           \n\t"
887
        "walignr1 wr2, wr12, wr13       \n\t"
888
        "wmoveq wr10, wr13              \n\t"
889
        "walignr2ne wr10, wr12, wr13    \n\t"
890
        "wunpckelub wr0, wr2            \n\t"
891
        "wunpckehub wr1, wr2            \n\t"
892
        "wunpckelub wr8, wr10           \n\t"
893
        "wunpckehub wr9, wr10           \n\t"
894
        "waddhus wr0, wr0, wr8          \n\t"
895
        "waddhus wr1, wr1, wr9          \n\t"
896
897
        "1:                             \n\t"
898
        // [wr0 wr1 wr2 wr3]
899
        // [wr4 wr5 wr6 wr7] <= *
900
        "wldrd wr12, [%[pixels]]        \n\t"
901
        "cmp r12, #8                    \n\t"
902
        "wldrd wr13, [%[pixels], #8]    \n\t"
903
        "add %[pixels], %[pixels], %[line_size] \n\t"
904
        "walignr1 wr6, wr12, wr13       \n\t"
905
        "pld [%[pixels]]                \n\t"
906
        "pld [%[pixels], #32]           \n\t"
907
        "wmoveq wr10, wr13              \n\t"
908
        "walignr2ne wr10, wr12, wr13    \n\t"
909
        "wunpckelub wr4, wr6            \n\t"
910
        "wunpckehub wr5, wr6            \n\t"
911
        "wunpckelub wr8, wr10           \n\t"
912
        "wunpckehub wr9, wr10           \n\t"
913
        "waddhus wr4, wr4, wr8          \n\t"
914
        "waddhus wr5, wr5, wr9          \n\t"
915
        "waddhus wr8, wr0, wr4          \n\t"
916
        "waddhus wr9, wr1, wr5          \n\t"
917
        "waddhus wr8, wr8, wr15         \n\t"
918
        "waddhus wr9, wr9, wr15         \n\t"
919
        "wldrd wr12, [%[block]]         \n\t"
920
        "wsrlhg wr8, wr8, wcgr0         \n\t"
921
        "wsrlhg wr9, wr9, wcgr0         \n\t"
922
        "wpackhus wr8, wr8, wr9         \n\t"
923
        WAVG2B" wr8, wr8, wr12          \n\t"
924
        "wstrd wr8, [%[block]]          \n\t"
925
        "add %[block], %[block], %[line_size]   \n\t"
926
        "wldrd wr12, [%[pixels]]        \n\t"
927
        "pld [%[block]]                 \n\t"
928
        "pld [%[block], #32]            \n\t"
929
930
        // [wr0 wr1 wr2 wr3] <= *
931
        // [wr4 wr5 wr6 wr7]
932
        "wldrd wr13, [%[pixels], #8]    \n\t"
933
        "add %[pixels], %[pixels], %[line_size] \n\t"
934
        "walignr1 wr2, wr12, wr13       \n\t"
935
        "pld [%[pixels]]                \n\t"
936
        "pld [%[pixels], #32]           \n\t"
937
        "wmoveq wr10, wr13              \n\t"
938
        "walignr2ne wr10, wr12, wr13    \n\t"
939
        "wunpckelub wr0, wr2            \n\t"
940
        "wunpckehub wr1, wr2            \n\t"
941
        "wunpckelub wr8, wr10           \n\t"
942
        "wunpckehub wr9, wr10           \n\t"
943
        "waddhus wr0, wr0, wr8          \n\t"
944
        "waddhus wr1, wr1, wr9          \n\t"
945
        "waddhus wr8, wr0, wr4          \n\t"
946
        "waddhus wr9, wr1, wr5          \n\t"
947
        "waddhus wr8, wr8, wr15         \n\t"
948
        "waddhus wr9, wr9, wr15         \n\t"
949
        "wldrd wr12, [%[block]]         \n\t"
950
        "wsrlhg wr8, wr8, wcgr0         \n\t"
951
        "wsrlhg wr9, wr9, wcgr0         \n\t"
952
        "wpackhus wr8, wr8, wr9         \n\t"
953
        "subs %[h], %[h], #2            \n\t"
954
        WAVG2B" wr8, wr8, wr12          \n\t"
955
        "wstrd wr8, [%[block]]          \n\t"
956
        "add %[block], %[block], %[line_size]   \n\t"
957
        "pld [%[block]]                 \n\t"
958
        "pld [%[block], #32]            \n\t"
959
        "bne 1b                         \n\t"
960
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
961
        : [line_size]"r"(line_size)
962
        : "r12", "memory");
963
}
964
965
void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
966
{
967
    // [wr0 wr1 wr2 wr3] for previous line
968
    // [wr4 wr5 wr6 wr7] for current line
969
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
970 be449fca Diego Pettenò
    __asm__ volatile(
971 6ad1fa5a Bernhard Rosenkränzer
        "pld [%[block]]                 \n\t"
972
        "pld [%[block], #32]            \n\t"
973
        "pld [%[pixels]]                \n\t"
974
        "mov r12, #2                    \n\t"
975
        "pld [%[pixels], #32]           \n\t"
976
        "tmcr wcgr0, r12                \n\t" /* for shift value */
977
        /* alignment */
978
        "and r12, %[pixels], #7         \n\t"
979
        "bic %[pixels], %[pixels], #7           \n\t"
980
        "tmcr wcgr1, r12                \n\t"
981
        "add r12, r12, #1               \n\t"
982
        "tmcr wcgr2, r12                \n\t"
983
984
        // [wr0 wr1 wr2 wr3] <= *
985
        // [wr4 wr5 wr6 wr7]
986
        "wldrd wr12, [%[pixels]]        \n\t"
987
        "cmp r12, #8                    \n\t"
988
        "wldrd wr13, [%[pixels], #8]    \n\t"
989
        "wldrd wr14, [%[pixels], #16]   \n\t"
990
        "add %[pixels], %[pixels], %[line_size] \n\t"
991
        "pld [%[pixels]]                \n\t"
992
        "walignr1 wr2, wr12, wr13       \n\t"
993
        "pld [%[pixels], #32]           \n\t"
994
        "walignr1 wr3, wr13, wr14       \n\t"
995
        "wmoveq wr10, wr13              \n\t"
996
        "wmoveq wr11, wr14              \n\t"
997
        "walignr2ne wr10, wr12, wr13    \n\t"
998
        "walignr2ne wr11, wr13, wr14    \n\t"
999
        "wunpckelub wr0, wr2            \n\t"
1000
        "wunpckehub wr1, wr2            \n\t"
1001
        "wunpckelub wr2, wr3            \n\t"
1002
        "wunpckehub wr3, wr3            \n\t"
1003
        "wunpckelub wr8, wr10           \n\t"
1004
        "wunpckehub wr9, wr10           \n\t"
1005
        "wunpckelub wr10, wr11          \n\t"
1006
        "wunpckehub wr11, wr11          \n\t"
1007
        "waddhus wr0, wr0, wr8          \n\t"
1008
        "waddhus wr1, wr1, wr9          \n\t"
1009
        "waddhus wr2, wr2, wr10         \n\t"
1010
        "waddhus wr3, wr3, wr11         \n\t"
1011
1012
        "1:                             \n\t"
1013
        // [wr0 wr1 wr2 wr3]
1014
        // [wr4 wr5 wr6 wr7] <= *
1015
        "wldrd wr12, [%[pixels]]        \n\t"
1016
        "cmp r12, #8                    \n\t"
1017
        "wldrd wr13, [%[pixels], #8]    \n\t"
1018
        "wldrd wr14, [%[pixels], #16]   \n\t"
1019
        "add %[pixels], %[pixels], %[line_size] \n\t"
1020
        "walignr1 wr6, wr12, wr13       \n\t"
1021
        "pld [%[pixels]]                \n\t"
1022
        "pld [%[pixels], #32]           \n\t"
1023
        "walignr1 wr7, wr13, wr14       \n\t"
1024
        "wmoveq wr10, wr13              \n\t"
1025
        "wmoveq wr11, wr14              \n\t"
1026
        "walignr2ne wr10, wr12, wr13    \n\t"
1027
        "walignr2ne wr11, wr13, wr14    \n\t"
1028
        "wunpckelub wr4, wr6            \n\t"
1029
        "wunpckehub wr5, wr6            \n\t"
1030
        "wunpckelub wr6, wr7            \n\t"
1031
        "wunpckehub wr7, wr7            \n\t"
1032
        "wunpckelub wr8, wr10           \n\t"
1033
        "wunpckehub wr9, wr10           \n\t"
1034
        "wunpckelub wr10, wr11          \n\t"
1035
        "wunpckehub wr11, wr11          \n\t"
1036
        "waddhus wr4, wr4, wr8          \n\t"
1037
        "waddhus wr5, wr5, wr9          \n\t"
1038
        "waddhus wr6, wr6, wr10         \n\t"
1039
        "waddhus wr7, wr7, wr11         \n\t"
1040
        "waddhus wr8, wr0, wr4          \n\t"
1041
        "waddhus wr9, wr1, wr5          \n\t"
1042
        "waddhus wr10, wr2, wr6         \n\t"
1043
        "waddhus wr11, wr3, wr7         \n\t"
1044
        "waddhus wr8, wr8, wr15         \n\t"
1045
        "waddhus wr9, wr9, wr15         \n\t"
1046
        "waddhus wr10, wr10, wr15       \n\t"
1047
        "waddhus wr11, wr11, wr15       \n\t"
1048
        "wsrlhg wr8, wr8, wcgr0         \n\t"
1049
        "wsrlhg wr9, wr9, wcgr0         \n\t"
1050
        "wldrd wr12, [%[block]]         \n\t"
1051
        "wldrd wr13, [%[block], #8]     \n\t"
1052
        "wsrlhg wr10, wr10, wcgr0       \n\t"
1053
        "wsrlhg wr11, wr11, wcgr0       \n\t"
1054
        "wpackhus wr8, wr8, wr9         \n\t"
1055
        "wpackhus wr9, wr10, wr11       \n\t"
1056
        WAVG2B" wr8, wr8, wr12          \n\t"
1057
        WAVG2B" wr9, wr9, wr13          \n\t"
1058
        "wstrd wr8, [%[block]]          \n\t"
1059
        "wstrd wr9, [%[block], #8]      \n\t"
1060
        "add %[block], %[block], %[line_size]   \n\t"
1061
1062
        // [wr0 wr1 wr2 wr3] <= *
1063
        // [wr4 wr5 wr6 wr7]
1064
        "wldrd wr12, [%[pixels]]        \n\t"
1065
        "pld [%[block]]                 \n\t"
1066
        "wldrd wr13, [%[pixels], #8]    \n\t"
1067
        "pld [%[block], #32]            \n\t"
1068
        "wldrd wr14, [%[pixels], #16]   \n\t"
1069
        "add %[pixels], %[pixels], %[line_size] \n\t"
1070
        "walignr1 wr2, wr12, wr13       \n\t"
1071
        "pld [%[pixels]]                \n\t"
1072
        "pld [%[pixels], #32]           \n\t"
1073
        "walignr1 wr3, wr13, wr14       \n\t"
1074
        "wmoveq wr10, wr13              \n\t"
1075
        "wmoveq wr11, wr14              \n\t"
1076
        "walignr2ne wr10, wr12, wr13    \n\t"
1077
        "walignr2ne wr11, wr13, wr14    \n\t"
1078
        "wunpckelub wr0, wr2            \n\t"
1079
        "wunpckehub wr1, wr2            \n\t"
1080
        "wunpckelub wr2, wr3            \n\t"
1081
        "wunpckehub wr3, wr3            \n\t"
1082
        "wunpckelub wr8, wr10           \n\t"
1083
        "wunpckehub wr9, wr10           \n\t"
1084
        "wunpckelub wr10, wr11          \n\t"
1085
        "wunpckehub wr11, wr11          \n\t"
1086
        "waddhus wr0, wr0, wr8          \n\t"
1087
        "waddhus wr1, wr1, wr9          \n\t"
1088
        "waddhus wr2, wr2, wr10         \n\t"
1089
        "waddhus wr3, wr3, wr11         \n\t"
1090
        "waddhus wr8, wr0, wr4          \n\t"
1091
        "waddhus wr9, wr1, wr5          \n\t"
1092
        "waddhus wr10, wr2, wr6         \n\t"
1093
        "waddhus wr11, wr3, wr7         \n\t"
1094
        "waddhus wr8, wr8, wr15         \n\t"
1095
        "waddhus wr9, wr9, wr15         \n\t"
1096
        "waddhus wr10, wr10, wr15       \n\t"
1097
        "waddhus wr11, wr11, wr15       \n\t"
1098
        "wsrlhg wr8, wr8, wcgr0         \n\t"
1099
        "wsrlhg wr9, wr9, wcgr0         \n\t"
1100
        "wldrd wr12, [%[block]]         \n\t"
1101
        "wldrd wr13, [%[block], #8]     \n\t"
1102
        "wsrlhg wr10, wr10, wcgr0       \n\t"
1103
        "wsrlhg wr11, wr11, wcgr0       \n\t"
1104
        "wpackhus wr8, wr8, wr9         \n\t"
1105
        "wpackhus wr9, wr10, wr11       \n\t"
1106
        WAVG2B" wr8, wr8, wr12          \n\t"
1107
        WAVG2B" wr9, wr9, wr13          \n\t"
1108
        "wstrd wr8, [%[block]]          \n\t"
1109
        "wstrd wr9, [%[block], #8]      \n\t"
1110
        "add %[block], %[block], %[line_size]   \n\t"
1111
        "subs %[h], %[h], #2            \n\t"
1112
        "pld [%[block]]                 \n\t"
1113
        "pld [%[block], #32]            \n\t"
1114
        "bne 1b                         \n\t"
1115
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
1116
        : [line_size]"r"(line_size)
1117
        : "r12", "memory");
1118
}