Revision 6ad1fa5a

View differences:

libavcodec/Makefile
316 316

  
317 317
# armv4l specific stuff
318 318
ifeq ($(TARGET_ARCH_ARMV4L),yes)
319
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o
319
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o
320 320
OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o
321
ifeq ($(TARGET_IWMMXT),yes)
322
OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o
323
endif
321 324
endif
322 325

  
323 326
# sun mediaLib specific stuff
......
327 330
CFLAGS += $(MLIB_INC)
328 331
endif
329 332

  
333
# Intel IPP specific stuff
334
# currently only works when libavcodec is used in mplayer
335
ifeq ($(HAVE_IPP),yes)
336
CFLAGS += $(IPP_INC)
337
endif
338

  
330 339
# alpha specific stuff
331 340
ifeq ($(TARGET_ARCH_ALPHA),yes)
332 341
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \
libavcodec/armv4l/dsputil_arm.c
18 18
 */
19 19

  
20 20
#include "../dsputil.h"
21
#ifdef HAVE_IPP
22
#include "ipp.h"
23
#endif
24

  
25
#ifdef HAVE_IWMMXT
26
extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
27
#endif
21 28

  
22 29
extern void j_rev_dct_ARM(DCTELEM *data);
23 30
extern void simple_idct_ARM(DCTELEM *data);
......
26 33
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
27 34
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
28 35

  
36
void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
37
void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
38
void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
39
void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
40

  
41
void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
42
void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
43
void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
44

  
45
void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
46
static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
47
{
48
    put_pixels8_x2_arm(block, pixels, line_size, h);
49
    put_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
50
}
51

  
52
static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
53
{
54
    put_pixels8_y2_arm(block, pixels, line_size, h);
55
    put_pixels8_y2_arm(block + 8, pixels + 8, line_size, h);
56
}
57

  
58
static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
59
{
60
    put_pixels8_xy2_arm(block, pixels, line_size, h);
61
    put_pixels8_xy2_arm(block + 8, pixels + 8, line_size, h);
62
}
63

  
64
static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
65
{
66
    put_no_rnd_pixels8_x2_arm(block, pixels, line_size, h);
67
    put_no_rnd_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
68
}
69

  
70
static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
71
{
72
    put_no_rnd_pixels8_y2_arm(block, pixels, line_size, h);
73
    put_no_rnd_pixels8_y2_arm(block + 8, pixels + 8, line_size, h);
74
}
75

  
76
static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
77
{
78
    put_no_rnd_pixels8_xy2_arm(block, pixels, line_size, h);
79
    put_no_rnd_pixels8_xy2_arm(block + 8, pixels + 8, line_size, h);
80
}
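
The 16-pixel wrappers above simply call the corresponding 8-pixel assembly routine twice, at byte offsets 0 and 8. As a point of reference for what the new x2 routines compute (and as a convenient cross-check against the assembly), here is a scalar sketch of the horizontal half-pel filter; rnd is 1 for the put variant and 0 for put_no_rnd, matching FFmpeg's rounding convention. Illustration only, not part of the patch.

#include <stdint.h>

/* Scalar reference for put_pixels8_x2 / put_no_rnd_pixels8_x2:
   each output byte averages pixels[x] and pixels[x + 1],
   rounding up (rnd = 1) or truncating (rnd = 0). */
static void put_pixels8_x2_ref(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h, int rnd)
{
    int x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            block[x] = (pixels[x] + pixels[x + 1] + rnd) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}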
81

  
82
static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size)
83
{
84
    asm volatile (
85
                  "mov r10, #8 \n\t"
86

  
87
                  "1: \n\t"
88

  
89
                  /* load dest */
90
                  "ldr r4, [%1] \n\t"
91
                  /* block[0] and block[1]*/
92
                  "ldrsh r5, [%0] \n\t"
93
                  "ldrsh r7, [%0, #2] \n\t"
94
                  "and r6, r4, #0xFF \n\t"
95
                  "and r8, r4, #0xFF00 \n\t"
96
                  "add r6, r5, r6 \n\t"
97
                  "add r8, r7, r8, lsr #8 \n\t"
98
                  "mvn r5, r5 \n\t"
99
                  "mvn r7, r7 \n\t"
100
                  "tst r6, #0x100 \n\t"
101
                  "movne r6, r5, lsr #24 \n\t"
102
                  "tst r8, #0x100 \n\t"
103
                  "movne r8, r7, lsr #24 \n\t"
104
                  "mov r9, r6 \n\t"
105
                  "ldrsh r5, [%0, #4] \n\t" /* moved from [A] */
106
                  "orr r9, r9, r8, lsl #8 \n\t"
107
                  /* block[2] and block[3] */
108
                  /* [A] */
109
                  "ldrsh r7, [%0, #6] \n\t"
110
                  "and r6, r4, #0xFF0000 \n\t"
111
                  "and r8, r4, #0xFF000000 \n\t"
112
                  "add r6, r5, r6, lsr #16 \n\t"
113
                  "add r8, r7, r8, lsr #24 \n\t"
114
                  "mvn r5, r5 \n\t"
115
                  "mvn r7, r7 \n\t"
116
                  "tst r6, #0x100 \n\t"
117
                  "movne r6, r5, lsr #24 \n\t"
118
                  "tst r8, #0x100 \n\t"
119
                  "movne r8, r7, lsr #24 \n\t"
120
                  "orr r9, r9, r6, lsl #16 \n\t"
121
                  "ldr r4, [%1, #4] \n\t"       /* moved from [B] */
122
                  "orr r9, r9, r8, lsl #24 \n\t"
123
                  /* store dest */
124
                  "ldrsh r5, [%0, #8] \n\t" /* moved from [C] */
125
                  "str r9, [%1] \n\t"
126

  
127
                  /* load dest */
128
                  /* [B] */
129
                  /* block[4] and block[5] */
130
                  /* [C] */
131
                  "ldrsh r7, [%0, #10] \n\t"
132
                  "and r6, r4, #0xFF \n\t"
133
                  "and r8, r4, #0xFF00 \n\t"
134
                  "add r6, r5, r6 \n\t"
135
                  "add r8, r7, r8, lsr #8 \n\t"
136
                  "mvn r5, r5 \n\t"
137
                  "mvn r7, r7 \n\t"
138
                  "tst r6, #0x100 \n\t"
139
                  "movne r6, r5, lsr #24 \n\t"
140
                  "tst r8, #0x100 \n\t"
141
                  "movne r8, r7, lsr #24 \n\t"
142
                  "mov r9, r6 \n\t"
143
                  "ldrsh r5, [%0, #12] \n\t" /* moved from [D] */
144
                  "orr r9, r9, r8, lsl #8 \n\t"
145
                  /* block[6] and block[7] */
146
                  /* [D] */
147
                  "ldrsh r7, [%0, #14] \n\t"
148
                  "and r6, r4, #0xFF0000 \n\t"
149
                  "and r8, r4, #0xFF000000 \n\t"
150
                  "add r6, r5, r6, lsr #16 \n\t"
151
                  "add r8, r7, r8, lsr #24 \n\t"
152
                  "mvn r5, r5 \n\t"
153
                  "mvn r7, r7 \n\t"
154
                  "tst r6, #0x100 \n\t"
155
                  "movne r6, r5, lsr #24 \n\t"
156
                  "tst r8, #0x100 \n\t"
157
                  "movne r8, r7, lsr #24 \n\t"
158
                  "orr r9, r9, r6, lsl #16 \n\t"
159
                  "add %0, %0, #16 \n\t" /* moved from [E] */
160
                  "orr r9, r9, r8, lsl #24 \n\t"
161
                  "subs r10, r10, #1 \n\t" /* moved from [F] */
162
                  /* store dest */
163
                  "str r9, [%1, #4] \n\t"
164

  
165
                  /* [E] */
166
                  /* [F] */
167
                  "add %1, %1, %2 \n\t"
168
                  "bne 1b \n\t"
169
                  :
170
                  : "r"(block),
171
                    "r"(dest),
172
                    "r"(line_size)
173
                  : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" );
174
}
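
The inline assembly above is a hand-scheduled replacement for add_pixels_clamped(): each 16-bit IDCT coefficient is added to the corresponding destination byte and the result is clamped to 0..255, four pixels per 32-bit store. A plain C sketch of the same behaviour, for reference only (it mirrors the generic routine, not this instruction scheduling):

/* Reference behaviour of add_pixels_clamped_ARM for an 8x8 block. */
static void add_pixels_clamped_ref(const short *block, unsigned char *dest, int line_size)
{
    int i, x;
    for (i = 0; i < 8; i++) {          /* 8 rows of the 8x8 block */
        for (x = 0; x < 8; x++) {
            int v = dest[x] + block[x];
            dest[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        block += 8;
        dest  += line_size;
    }
}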
175

  
29 176
/* XXX: those functions should be suppressed ASAP when all IDCTs are
30 177
   converted */
31 178
static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
......
48 195
    simple_idct_ARM (block);
49 196
    ff_add_pixels_clamped(block, dest, line_size);
50 197
}
198
static void simple_idct_ipp(DCTELEM *block)
199
{
200
#ifdef HAVE_IPP
201
    ippiDCT8x8Inv_Video_16s_C1I(block);
202
#endif
203
}
204
static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
205
{
206
#ifdef HAVE_IPP
207
    ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
208
#endif
209
}
210

  
211
#ifdef HAVE_IWMMXT
212
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
213
#endif
214

  
215
static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
216
{
217
#ifdef HAVE_IPP
218
    ippiDCT8x8Inv_Video_16s_C1I(block);
219
#ifdef HAVE_IWMMXT
220
    add_pixels_clamped_iwmmxt(block, dest, line_size);
221
#else
222
    add_pixels_clamped_ARM(block, dest, line_size);
223
#endif
224
#endif
225
}
51 226

  
52 227
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
53 228
{
......
56 231
    ff_put_pixels_clamped = c->put_pixels_clamped;
57 232
    ff_add_pixels_clamped = c->add_pixels_clamped;
58 233

  
234
#ifdef HAVE_IPP
235
    if(idct_algo==FF_IDCT_ARM){
236
#else
59 237
    if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){
238
#endif
60 239
        c->idct_put= j_rev_dct_ARM_put;
61 240
        c->idct_add= j_rev_dct_ARM_add;
62 241
	c->idct    = j_rev_dct_ARM;
......
66 245
	c->idct_add= simple_idct_ARM_add;
67 246
	c->idct    = simple_idct_ARM;
68 247
	c->idct_permutation_type= FF_NO_IDCT_PERM;
248
#ifdef HAVE_IPP
249
    } else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){
250
#else
251
    } else if (idct_algo==FF_IDCT_IPP){
252
#endif
253
        c->idct_put= simple_idct_ipp_put;
254
        c->idct_add= simple_idct_ipp_add;
255
        c->idct    = simple_idct_ipp;
256
        c->idct_permutation_type= FF_NO_IDCT_PERM;
69 257
    }
258

  
259
/*     c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG!
260
    c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
261
    c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
262
/*     c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */
263
/*     c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ? */
264
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
265
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
266
/*     c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */
267
    c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
268
    c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
269
/*     c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */
270
/*     c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */
271
    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
272
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
273
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
274
/*     c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */
275

  
276
#if 1
277
#ifdef HAVE_IWMMXT
278
    dsputil_init_iwmmxt(c, avctx);
279
#endif
280
#endif
70 281
}
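
A note on the selection logic above: when HAVE_IPP is defined, FF_IDCT_AUTO now falls through to the IPP IDCT, so the ARM jrevdct has to be requested explicitly; without IPP the old behaviour is unchanged. A hedged usage sketch, assuming the usual AVCodecContext idct_algo field is what feeds the idct_algo value used above:

/* Force a particular IDCT regardless of the build's default
   (with HAVE_IPP, FF_IDCT_AUTO selects simple_idct_ipp above). */
avctx->idct_algo = FF_IDCT_ARM;    /* j_rev_dct_ARM put/add/idct      */
/* avctx->idct_algo = FF_IDCT_IPP;    IPP IDCT, needs an IPP build    */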
libavcodec/armv4l/dsputil_arm_s.S
1
@
2
@ ARMv4L optimized DSP utils
3
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
4
@
5
@ This library is free software; you can redistribute it and/or
6
@ modify it under the terms of the GNU Lesser General Public
7
@ License as published by the Free Software Foundation; either
8
@ version 2 of the License, or (at your option) any later version.
9
@
10
@ This library is distributed in the hope that it will be useful,
11
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
@ Lesser General Public License for more details.
14
@
15
@ You should have received a copy of the GNU Lesser General Public
16
@ License along with this library; if not, write to the Free Software
17
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
@
19

  
20
.macro  ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
21
        mov \Rd0, \Rn0, lsr #(\shift * 8)
22
        mov \Rd1, \Rn1, lsr #(\shift * 8)
23
        mov \Rd2, \Rn2, lsr #(\shift * 8)
24
        mov \Rd3, \Rn3, lsr #(\shift * 8)
25
        orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
26
        orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
27
        orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
28
        orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
29
.endm
30
.macro  ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
31
        mov \R0, \R0, lsr #(\shift * 8)
32
        orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
33
        mov \R1, \R1, lsr #(\shift * 8)
34
        orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
35
.endm
36
.macro  ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
37
        mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
38
        mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
39
        orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
40
        orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
41
.endm
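
The ADJ_ALIGN_* macros above rebuild unaligned source data from word-aligned loads: each destination word is taken from two adjacent aligned words, shifted by the byte misalignment. A little-endian C sketch of ADJ_ALIGN_DOUBLEWORD, for illustration only (shift is the misalignment in bytes, 1..3; the aligned case never reaches these macros):

#include <stdint.h>

/* Rebuild the two words starting at byte offset 'shift' (1..3) from the
   three aligned words *r0, *r1, r2 covering bytes 0..11 (little-endian). */
static void adj_align_doubleword(int shift, uint32_t *r0, uint32_t *r1, uint32_t r2)
{
    *r0 = (*r0 >> (shift * 8)) | (*r1 << (32 - shift * 8));
    *r1 = (*r1 >> (shift * 8)) | (r2  << (32 - shift * 8));
}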
42

  
43
.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
44
        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
45
        @ Rmask = 0xFEFEFEFE
46
        @ Rn is destroyed
47
        eor \Rd0, \Rn0, \Rm0
48
        eor \Rd1, \Rn1, \Rm1
49
        orr \Rn0, \Rn0, \Rm0
50
        orr \Rn1, \Rn1, \Rm1
51
        and \Rd0, \Rd0, \Rmask
52
        and \Rd1, \Rd1, \Rmask
53
        sub \Rd0, \Rn0, \Rd0, lsr #1
54
        sub \Rd1, \Rn1, \Rd1, lsr #1
55
.endm
56

  
57
.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
58
        @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
59
        @ Rmask = 0xFEFEFEFE
60
        @ Rn is destroyed
61
        eor \Rd0, \Rn0, \Rm0
62
        eor \Rd1, \Rn1, \Rm1
63
        and \Rn0, \Rn0, \Rm0
64
        and \Rn1, \Rn1, \Rm1
65
        and \Rd0, \Rd0, \Rmask
66
        and \Rd1, \Rd1, \Rmask
67
        add \Rd0, \Rn0, \Rd0, lsr #1
68
        add \Rd1, \Rn1, \Rd1, lsr #1
69
.endm
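
RND_AVG32 and NO_RND_AVG32 average eight packed bytes held in two registers without unpacking them. Per 32-bit word the identities are (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1) for the rounding average and (a & b) + (((a ^ b) & 0xFEFEFEFE) >> 1) for the truncating one; the 0xFEFEFEFE mask keeps the per-byte shift from leaking bits across byte lanes. A scalar C sketch for illustration:

#include <stdint.h>

/* Byte-wise (x + y + 1) >> 1 on four packed bytes, as in RND_AVG32. */
static uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & 0xFEFEFEFEu) >> 1);
}

/* Byte-wise (x + y) >> 1 on four packed bytes, as in NO_RND_AVG32. */
static uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & 0xFEFEFEFEu) >> 1);
}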
70

  
71
@ ----------------------------------------------------------------
72
        .align 8
73
        .global put_pixels16_arm
74
put_pixels16_arm:
75
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
76
        @ block = word aligned, pixels = unaligned
77
        pld [r1]
78
        stmfd sp!, {r4-r11, lr} @ R14 is also called LR
79
        adr r5, 5f
80
        ands r4, r1, #3
81
        bic r1, r1, #3
82
        add r5, r5, r4, lsl #2
83
        ldrne pc, [r5]
84
1:
85
        ldmia r1, {r4-r7}
86
        add r1, r1, r2
87
        stmia r0, {r4-r7}
88
        pld [r1]
89
        subs r3, r3, #1
90
        add r0, r0, r2
91
        bne 1b
92
        ldmfd sp!, {r4-r11, pc}
93
        .align 8
94
2:
95
        ldmia r1, {r4-r8}
96
        add r1, r1, r2
97
        ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
98
        pld [r1]
99
        subs r3, r3, #1
100
        stmia r0, {r9-r12}
101
        add r0, r0, r2
102
        bne 2b
103
        ldmfd sp!, {r4-r11, pc}
104
        .align 8
105
3:
106
        ldmia r1, {r4-r8}
107
        add r1, r1, r2
108
        ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
109
        pld [r1]
110
        subs r3, r3, #1
111
        stmia r0, {r9-r12}
112
        add r0, r0, r2
113
        bne 3b
114
        ldmfd sp!, {r4-r11, pc}
115
        .align 8
116
4:
117
        ldmia r1, {r4-r8}
118
        add r1, r1, r2
119
        ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
120
        pld [r1]
121
        subs r3, r3, #1
122
        stmia r0, {r9-r12}
123
        add r0, r0, r2
124
        bne 4b
125
        ldmfd sp!, {r4-r11,pc}
126
        .align 8
127
5:
128
        .word 1b
129
        .word 2b
130
        .word 3b
131
        .word 4b
132

  
133
@ ----------------------------------------------------------------
134
        .align 8
135
        .global put_pixels8_arm
136
put_pixels8_arm:
137
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
138
        @ block = word aligned, pixels = unaligned
139
        pld [r1]
140
        stmfd sp!, {r4-r5,lr} @ R14 is also called LR
141
        adr r5, 5f
142
        ands r4, r1, #3
143
        bic r1, r1, #3
144
        add r5, r5, r4, lsl #2
145
        ldrne pc, [r5]
146
1:
147
        ldmia r1, {r4-r5}
148
        add r1, r1, r2
149
        subs r3, r3, #1
150
        pld [r1]
151
        stmia r0, {r4-r5}
152
        add r0, r0, r2
153
        bne 1b
154
        ldmfd sp!, {r4-r5,pc}
155
        .align 8
156
2:
157
        ldmia r1, {r4-r5, r12}
158
        add r1, r1, r2
159
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
160
        pld [r1]
161
        subs r3, r3, #1
162
        stmia r0, {r4-r5}
163
        add r0, r0, r2
164
        bne 2b
165
        ldmfd sp!, {r4-r5,pc}
166
        .align 8
167
3:
168
        ldmia r1, {r4-r5, r12}
169
        add r1, r1, r2
170
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
171
        pld [r1]
172
        subs r3, r3, #1
173
        stmia r0, {r4-r5}
174
        add r0, r0, r2
175
        bne 3b
176
        ldmfd sp!, {r4-r5,pc}
177
        .align 8
178
4:
179
        ldmia r1, {r4-r5, r12}
180
        add r1, r1, r2
181
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
182
        pld [r1]
183
        subs r3, r3, #1
184
        stmia r0, {r4-r5}
185
        add r0, r0, r2
186
        bne 4b
187
        ldmfd sp!, {r4-r5,pc}
188
        .align 8
189
5:
190
        .word 1b
191
        .word 2b
192
        .word 3b
193
        .word 4b
194

  
195
@ ----------------------------------------------------------------
196
        .align 8
197
        .global put_pixels8_x2_arm
198
put_pixels8_x2_arm:
199
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
200
        @ block = word aligned, pixels = unaligned
201
        pld [r1]
202
        stmfd sp!, {r4-r10,lr} @ R14 is also called LR
203
        adr r5, 5f
204
        ands r4, r1, #3
205
        ldr r12, [r5]
206
        add r5, r5, r4, lsl #2
207
        bic r1, r1, #3
208
        ldrne pc, [r5]
209
1:
210
        ldmia r1, {r4-r5, r10}
211
        add r1, r1, r2
212
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
213
        pld [r1]
214
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
215
        subs r3, r3, #1
216
        stmia r0, {r8-r9}
217
        add r0, r0, r2
218
        bne 1b
219
        ldmfd sp!, {r4-r10,pc}
220
        .align 8
221
2:
222
        ldmia r1, {r4-r5, r10}
223
        add r1, r1, r2
224
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
225
        ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
226
        pld [r1]
227
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
228
        subs r3, r3, #1
229
        stmia r0, {r4-r5}
230
        add r0, r0, r2
231
        bne 2b
232
        ldmfd sp!, {r4-r10,pc}
233
        .align 8
234
3:
235
        ldmia r1, {r4-r5, r10}
236
        add r1, r1, r2
237
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
238
        ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
239
        pld [r1]
240
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
241
        subs r3, r3, #1
242
        stmia r0, {r4-r5}
243
        add r0, r0, r2
244
        bne 3b
245
        ldmfd sp!, {r4-r10,pc}
246
        .align 8
247
4:
248
        ldmia r1, {r4-r5, r10}
249
        add r1, r1, r2
250
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
251
        pld [r1]
252
        RND_AVG32 r8, r9, r6, r7, r5, r10, r12
253
        subs r3, r3, #1
254
        stmia r0, {r8-r9}
255
        add r0, r0, r2
256
        bne 4b
257
        ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
258
        .align 8
259
5:
260
        .word 0xFEFEFEFE
261
        .word 2b
262
        .word 3b
263
        .word 4b
264

  
265
        .align 8
266
        .global put_no_rnd_pixels8_x2_arm
267
put_no_rnd_pixels8_x2_arm:
268
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
269
        @ block = word aligned, pixels = unaligned
270
        pld [r1]
271
        stmfd sp!, {r4-r10,lr} @ R14 is also called LR
272
        adr r5, 5f
273
        ands r4, r1, #3
274
        ldr r12, [r5]
275
        add r5, r5, r4, lsl #2
276
        bic r1, r1, #3
277
        ldrne pc, [r5]
278
1:
279
        ldmia r1, {r4-r5, r10}
280
        add r1, r1, r2
281
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
282
        pld [r1]
283
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
284
        subs r3, r3, #1
285
        stmia r0, {r8-r9}
286
        add r0, r0, r2
287
        bne 1b
288
        ldmfd sp!, {r4-r10,pc}
289
        .align 8
290
2:
291
        ldmia r1, {r4-r5, r10}
292
        add r1, r1, r2
293
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
294
        ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
295
        pld [r1]
296
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
297
        subs r3, r3, #1
298
        stmia r0, {r4-r5}
299
        add r0, r0, r2
300
        bne 2b
301
        ldmfd sp!, {r4-r10,pc}
302
        .align 8
303
3:
304
        ldmia r1, {r4-r5, r10}
305
        add r1, r1, r2
306
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
307
        ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
308
        pld [r1]
309
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
310
        subs r3, r3, #1
311
        stmia r0, {r4-r5}
312
        add r0, r0, r2
313
        bne 3b
314
        ldmfd sp!, {r4-r10,pc}
315
        .align 8
316
4:
317
        ldmia r1, {r4-r5, r10}
318
        add r1, r1, r2
319
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
320
        pld [r1]
321
        NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
322
        subs r3, r3, #1
323
        stmia r0, {r8-r9}
324
        add r0, r0, r2
325
        bne 4b
326
        ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
327
        .align 8
328
5:
329
        .word 0xFEFEFEFE
330
        .word 2b
331
        .word 3b
332
        .word 4b
333

  
334

  
335
@ ----------------------------------------------------------------
336
        .align 8
337
        .global put_pixels8_y2_arm
338
put_pixels8_y2_arm:
339
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
340
        @ block = word aligned, pixels = unaligned
341
        pld [r1]
342
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
343
        adr r5, 5f
344
        ands r4, r1, #3
345
        mov r3, r3, lsr #1
346
        ldr r12, [r5]
347
        add r5, r5, r4, lsl #2
348
        bic r1, r1, #3
349
        ldrne pc, [r5]
350
1:
351
        ldmia r1, {r4-r5}
352
        add r1, r1, r2
353
6:      ldmia r1, {r6-r7}
354
        add r1, r1, r2
355
        pld [r1]
356
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
357
        ldmia r1, {r4-r5}
358
        add r1, r1, r2
359
        stmia r0, {r8-r9}
360
        add r0, r0, r2
361
        pld [r1]
362
        RND_AVG32 r8, r9, r6, r7, r4, r5, r12
363
        subs r3, r3, #1
364
        stmia r0, {r8-r9}
365
        add r0, r0, r2
366
        bne 6b
367
        ldmfd sp!, {r4-r11,pc}
368
        .align 8
369
2:
370
        ldmia r1, {r4-r6}
371
        add r1, r1, r2
372
        pld [r1]
373
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
374
6:      ldmia r1, {r7-r9}
375
        add r1, r1, r2
376
        pld [r1]
377
        ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
378
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
379
        stmia r0, {r10-r11}
380
        add r0, r0, r2
381
        ldmia r1, {r4-r6}
382
        add r1, r1, r2
383
        pld [r1]
384
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
385
        subs r3, r3, #1
386
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
387
        stmia r0, {r10-r11}
388
        add r0, r0, r2
389
        bne 6b
390
        ldmfd sp!, {r4-r11,pc}
391
        .align 8
392
3:
393
        ldmia r1, {r4-r6}
394
        add r1, r1, r2
395
        pld [r1]
396
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
397
6:      ldmia r1, {r7-r9}
398
        add r1, r1, r2
399
        pld [r1]
400
        ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
401
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
402
        stmia r0, {r10-r11}
403
        add r0, r0, r2
404
        ldmia r1, {r4-r6}
405
        add r1, r1, r2
406
        pld [r1]
407
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
408
        subs r3, r3, #1
409
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
410
        stmia r0, {r10-r11}
411
        add r0, r0, r2
412
        bne 6b
413
        ldmfd sp!, {r4-r11,pc}
414
        .align 8
415
4:
416
        ldmia r1, {r4-r6}
417
        add r1, r1, r2
418
        pld [r1]
419
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
420
6:      ldmia r1, {r7-r9}
421
        add r1, r1, r2
422
        pld [r1]
423
        ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
424
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
425
        stmia r0, {r10-r11}
426
        add r0, r0, r2
427
        ldmia r1, {r4-r6}
428
        add r1, r1, r2
429
        pld [r1]
430
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
431
        subs r3, r3, #1
432
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
433
        stmia r0, {r10-r11}
434
        add r0, r0, r2
435
        bne 6b
436
        ldmfd sp!, {r4-r11,pc}
437

  
438
        .align 8
439
5:
440
        .word 0xFEFEFEFE
441
        .word 2b
442
        .word 3b
443
        .word 4b
444

  
445
        .align 8
446
        .global put_no_rnd_pixels8_y2_arm
447
put_no_rnd_pixels8_y2_arm:
448
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
449
        @ block = word aligned, pixels = unaligned
450
        pld [r1]
451
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
452
        adr r5, 5f
453
        ands r4, r1, #3
454
        mov r3, r3, lsr #1
455
        ldr r12, [r5]
456
        add r5, r5, r4, lsl #2
457
        bic r1, r1, #3
458
        ldrne pc, [r5]
459
1:
460
        ldmia r1, {r4-r5}
461
        add r1, r1, r2
462
6:      ldmia r1, {r6-r7}
463
        add r1, r1, r2
464
        pld [r1]
465
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
466
        ldmia r1, {r4-r5}
467
        add r1, r1, r2
468
        stmia r0, {r8-r9}
469
        add r0, r0, r2
470
        pld [r1]
471
        NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
472
        subs r3, r3, #1
473
        stmia r0, {r8-r9}
474
        add r0, r0, r2
475
        bne 6b
476
        ldmfd sp!, {r4-r11,pc}
477
        .align 8
478
2:
479
        ldmia r1, {r4-r6}
480
        add r1, r1, r2
481
        pld [r1]
482
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
483
6:      ldmia r1, {r7-r9}
484
        add r1, r1, r2
485
        pld [r1]
486
        ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
487
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
488
        stmia r0, {r10-r11}
489
        add r0, r0, r2
490
        ldmia r1, {r4-r6}
491
        add r1, r1, r2
492
        pld [r1]
493
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
494
        subs r3, r3, #1
495
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
496
        stmia r0, {r10-r11}
497
        add r0, r0, r2
498
        bne 6b
499
        ldmfd sp!, {r4-r11,pc}
500
        .align 8
501
3:
502
        ldmia r1, {r4-r6}
503
        add r1, r1, r2
504
        pld [r1]
505
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
506
6:      ldmia r1, {r7-r9}
507
        add r1, r1, r2
508
        pld [r1]
509
        ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
510
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
511
        stmia r0, {r10-r11}
512
        add r0, r0, r2
513
        ldmia r1, {r4-r6}
514
        add r1, r1, r2
515
        pld [r1]
516
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
517
        subs r3, r3, #1
518
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
519
        stmia r0, {r10-r11}
520
        add r0, r0, r2
521
        bne 6b
522
        ldmfd sp!, {r4-r11,pc}
523
        .align 8
524
4:
525
        ldmia r1, {r4-r6}
526
        add r1, r1, r2
527
        pld [r1]
528
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
529
6:      ldmia r1, {r7-r9}
530
        add r1, r1, r2
531
        pld [r1]
532
        ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
533
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
534
        stmia r0, {r10-r11}
535
        add r0, r0, r2
536
        ldmia r1, {r4-r6}
537
        add r1, r1, r2
538
        pld [r1]
539
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
540
        subs r3, r3, #1
541
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
542
        stmia r0, {r10-r11}
543
        add r0, r0, r2
544
        bne 6b
545
        ldmfd sp!, {r4-r11,pc}
546
        .align 8
547
5:
548
        .word 0xFEFEFEFE
549
        .word 2b
550
        .word 3b
551
        .word 4b
552

  
553
@ ----------------------------------------------------------------
554
.macro  RND_XY2_IT align, rnd
555
        @ l1=  (a & 0x03030303) + (b & 0x03030303) (+ 0x02020202 when rounding)
556
        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
557
.if \align == 0
558
        ldmia r1, {r6-r8}
559
.elseif \align == 3
560
        ldmia r1, {r5-r7}
561
.else
562
        ldmia r1, {r8-r10}
563
.endif
564
        add r1, r1, r2
565
        pld [r1]
566
.if \align == 0
567
        ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
568
.elseif \align == 1
569
        ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
570
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
571
.elseif \align == 2
572
        ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
573
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
574
.elseif \align == 3
575
        ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
576
.endif
577
        ldr r14, [r12, #0]      @ 0x03030303
578
        tst r3, #1
579
        and r8, r4, r14
580
        and r9, r5, r14
581
        and r10, r6, r14
582
        and r11, r7, r14
583
.if \rnd == 1
584
        ldreq r14, [r12, #16]   @ 0x02020202
585
.else
586
        ldreq r14, [r12, #28]   @ 0x01010101
587
.endif
588
        add r8, r8, r10
589
        add r9, r9, r11
590
        addeq r8, r8, r14
591
        addeq r9, r9, r14
592
        ldr r14, [r12, #20]     @ 0xFCFCFCFC >> 2
593
        and r4, r14, r4, lsr #2
594
        and r5, r14, r5, lsr #2
595
        and r6, r14, r6, lsr #2
596
        and r7, r14, r7, lsr #2
597
        add r10, r4, r6
598
        add r11, r5, r7
599
.endm
600

  
601
.macro RND_XY2_EXPAND align, rnd
602
        RND_XY2_IT \align, \rnd
603
6:      stmfd sp!, {r8-r11}
604
        RND_XY2_IT \align, \rnd
605
        ldmfd sp!, {r4-r7}
606
        add r4, r4, r8
607
        add r5, r5, r9
608
        add r6, r6, r10
609
        add r7, r7, r11
610
        ldr r14, [r12, #24]     @ 0x0F0F0F0F
611
        and r4, r14, r4, lsr #2
612
        and r5, r14, r5, lsr #2
613
        add r4, r4, r6
614
        add r5, r5, r7
615
        subs r3, r3, #1
616
        stmia r0, {r4-r5}
617
        add r0, r0, r2
618
        bne 6b
619
        ldmfd sp!, {r4-r11,pc}
620
.endm
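
RND_XY2_IT splits each source word into its low two bits (mask 0x03030303) and its top six bits (shifted right by two, mask 0xFCFCFCFC >> 2) so that four neighbouring pixels can be summed per byte lane without overflow; RND_XY2_EXPAND accumulates two such partial sums, adds the rounding bias (0x02020202 for the rounding variant, 0x01010101 otherwise) and shifts the low parts back into place. Per output pixel this is simply the 2x2 average below (scalar sketch, illustration only):

#include <stdint.h>

/* One output byte of the xy2 (half-pel in x and y) filter:
   rnd = 1 for put_pixels8_xy2, rnd = 0 for put_no_rnd_pixels8_xy2. */
static uint8_t avg_xy2(uint8_t a, uint8_t b, uint8_t c, uint8_t d, int rnd)
{
    return (uint8_t)((a + b + c + d + (rnd ? 2 : 1)) >> 2);
}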
621

  
622
        .align 8
623
        .global put_pixels8_xy2_arm
624
put_pixels8_xy2_arm:
625
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
626
        @ block = word aligned, pixels = unaligned
627
        pld [r1]
628
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
629
        adrl r12, 5f
630
        ands r4, r1, #3
631
        add r5, r12, r4, lsl #2
632
        bic r1, r1, #3
633
        ldrne pc, [r5]
634
1:
635
        RND_XY2_EXPAND 0, 1
636

  
637
        .align 8
638
2:
639
        RND_XY2_EXPAND 1, 1
640
        
641
        .align 8
642
3:
643
        RND_XY2_EXPAND 2, 1
644
        
645
        .align 8
646
4:
647
        RND_XY2_EXPAND 3, 1
648
        
649
5:
650
        .word 0x03030303
651
        .word 2b
652
        .word 3b
653
        .word 4b
654
        .word 0x02020202
655
        .word 0xFCFCFCFC >> 2
656
        .word 0x0F0F0F0F
657
        .word 0x01010101
658

  
659
        .align 8
660
        .global put_no_rnd_pixels8_xy2_arm
661
put_no_rnd_pixels8_xy2_arm:
662
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
663
        @ block = word aligned, pixels = unaligned
664
        pld [r1]
665
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
666
        adrl r12, 5f
667
        ands r4, r1, #3
668
        add r5, r12, r4, lsl #2
669
        bic r1, r1, #3
670
        ldrne pc, [r5]
671
1:
672
        RND_XY2_EXPAND 0, 0
673

  
674
        .align 8
675
2:
676
        RND_XY2_EXPAND 1, 0
677
        
678
        .align 8
679
3:
680
        RND_XY2_EXPAND 2, 0
681
        
682
        .align 8
683
4:
684
        RND_XY2_EXPAND 3, 0
685
        
686
5:
687
        .word 0x03030303
688
        .word 2b
689
        .word 3b
690
        .word 4b
691
        .word 0x02020202
692
        .word 0xFCFCFCFC >> 2
693
        .word 0x0F0F0F0F
694
        .word 0x01010101
libavcodec/armv4l/dsputil_iwmmxt.c
1
/*
2
 * iWMMXt optimized DSP utils
3
 * Copyright (c) 2004 AGAWA Koji
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 */
19

  
20
#include "../dsputil.h"
21

  
22
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
23
#define SET_RND(regd)  __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
24
#define WAVG2B "wavg2b"
25
#include "dsputil_iwmmxt_rnd.h"
26
#undef DEF
27
#undef SET_RND
28
#undef WAVG2B
29

  
30
#define DEF(x, y) x ## _ ## y ##_iwmmxt
31
#define SET_RND(regd)  __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
32
#define WAVG2B "wavg2br"
33
#include "dsputil_iwmmxt_rnd.h"
34
#undef DEF
35
#undef SET_RND
36
#undef WAVG2B
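
dsputil_iwmmxt_rnd.h acts as a template: it is included twice, and DEF, SET_RND and WAVG2B pick the generated function names, the per-lane rounding constant that SET_RND broadcasts into a wMMX register, and the averaging instruction (wavg2b vs. wavg2br). A small illustration of the name pasting (not part of the patch):

#define DEF(x, y) x ## _no_rnd_ ## y ## _iwmmxt
void DEF(put, pixels8)(void);   /* expands to: void put_no_rnd_pixels8_iwmmxt(void); */
#undef DEF

#define DEF(x, y) x ## _ ## y ## _iwmmxt
void DEF(put, pixels8)(void);   /* expands to: void put_pixels8_iwmmxt(void);        */
#undef DEF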
37

  
38
// needs scheduling
39
#define OP(AVG)                                         \
40
    asm volatile (                                      \
41
        /* alignment */                                 \
42
        "and r12, %[pixels], #7 \n\t"                   \
43
        "bic %[pixels], %[pixels], #7 \n\t"             \
44
        "tmcr wcgr1, r12 \n\t"                          \
45
                                                        \
46
        "wldrd wr0, [%[pixels]] \n\t"                   \
47
        "wldrd wr1, [%[pixels], #8] \n\t"               \
48
        "add %[pixels], %[pixels], %[line_size] \n\t"   \
49
        "walignr1 wr4, wr0, wr1 \n\t"                   \
50
                                                        \
51
        "1: \n\t"                                       \
52
                                                        \
53
        "wldrd wr2, [%[pixels]] \n\t"                   \
54
        "wldrd wr3, [%[pixels], #8] \n\t"               \
55
        "add %[pixels], %[pixels], %[line_size] \n\t"   \
56
        "pld [%[pixels]] \n\t"                          \
57
        "walignr1 wr5, wr2, wr3 \n\t"                   \
58
        AVG " wr6, wr4, wr5 \n\t"                       \
59
        "wstrd wr6, [%[block]] \n\t"                    \
60
        "add %[block], %[block], %[line_size] \n\t"     \
61
                                                        \
62
        "wldrd wr0, [%[pixels]] \n\t"                   \
63
        "wldrd wr1, [%[pixels], #8] \n\t"               \
64
        "add %[pixels], %[pixels], %[line_size] \n\t"   \
65
        "walignr1 wr4, wr0, wr1 \n\t"                   \
66
        "pld [%[pixels]] \n\t"                          \
67
        AVG " wr6, wr4, wr5 \n\t"                       \
68
        "wstrd wr6, [%[block]] \n\t"                    \
69
        "add %[block], %[block], %[line_size] \n\t"     \
70
                                                        \
71
        "subs %[h], %[h], #2 \n\t"                      \
72
        "bne 1b \n\t"                                   \
73
        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)  \
74
        : [line_size]"r"(line_size) \
75
        : "memory", "r12");
76
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
77
{
78
    OP("wavg2br");
79
}
80
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
81
{
82
    OP("wavg2b");
83
}
84
#undef OP
85

  
86
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
87
{
88
    uint8_t *pixels2 = pixels + line_size;
89

  
90
    __asm__ __volatile__ (
91
        "mov            r12, #4                 \n\t"
92
        "1:                                     \n\t"
93
        "pld            [%[pixels], %[line_size2]]              \n\t"
94
        "pld            [%[pixels2], %[line_size2]]             \n\t"
95
        "wldrd          wr4, [%[pixels]]        \n\t"
96
        "wldrd          wr5, [%[pixels2]]       \n\t"
97
        "pld            [%[block], #32]         \n\t"
98
        "wunpckelub     wr6, wr4                \n\t"
99
        "wldrd          wr0, [%[block]]         \n\t"
100
        "wunpckehub     wr7, wr4                \n\t"
101
        "wldrd          wr1, [%[block], #8]     \n\t"
102
        "wunpckelub     wr8, wr5                \n\t"
103
        "wldrd          wr2, [%[block], #16]    \n\t"
104
        "wunpckehub     wr9, wr5                \n\t"
105
        "wldrd          wr3, [%[block], #24]    \n\t"
106
        "add            %[block], %[block], #32 \n\t"
107
        "waddhss        wr10, wr0, wr6          \n\t"
108
        "waddhss        wr11, wr1, wr7          \n\t"
109
        "waddhss        wr12, wr2, wr8          \n\t"
110
        "waddhss        wr13, wr3, wr9          \n\t"
111
        "wpackhus       wr14, wr10, wr11        \n\t"
112
        "wpackhus       wr15, wr12, wr13        \n\t"
113
        "wstrd          wr14, [%[pixels]]       \n\t"
114
        "add            %[pixels], %[pixels], %[line_size2]     \n\t"
115
        "subs           r12, r12, #1            \n\t"
116
        "wstrd          wr15, [%[pixels2]]      \n\t"
117
        "add            %[pixels2], %[pixels2], %[line_size2]   \n\t"
118
        "bne            1b                      \n\t"
119
        : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
120
        : [line_size2]"r"(line_size << 1)
121
        : "cc", "memory", "r12");
122
}
123

  
124
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
125
{
126
    return;
127
}
128

  
129
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
130
{
131
    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
132

  
133
    c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
134
    c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
135
    c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
136
    c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
137
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
138
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
139
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
140
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
141

  
142
    c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
143
    c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
144
    c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
145
    c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
146
    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
147
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
148
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
149
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
150

  
151
    c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
152
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
153
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
154
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
155
    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
156
    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
157
    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
158
    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
159

  
160
    c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
161
    c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
162
    c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
163
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
164
    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
165
    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
166
    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
167
    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
168
}
libavcodec/armv4l/dsputil_iwmmxt_rnd.h
1
void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
2
{
3
    int stride = line_size;
4
    __asm__ __volatile__ (
5
        "and r12, %[pixels], #7 \n\t"
6
        "bic %[pixels], %[pixels], #7 \n\t"
7
        "tmcr wcgr1, r12 \n\t"
8
        "add r4, %[pixels], %[line_size] \n\t"
9
        "add r5, %[block], %[line_size] \n\t"
10
        "mov %[line_size], %[line_size], lsl #1 \n\t"
11
        "1: \n\t"
12
        "wldrd wr0, [%[pixels]] \n\t"
13
        "subs %[h], %[h], #2 \n\t"
14
        "wldrd wr1, [%[pixels], #8] \n\t"
15
        "add %[pixels], %[pixels], %[line_size] \n\t"
16
        "wldrd wr3, [r4] \n\t"
17
        "pld [%[pixels]] \n\t"
18
        "pld [%[pixels], #32] \n\t"
19
        "wldrd wr4, [r4, #8] \n\t"
20
        "add r4, r4, %[line_size] \n\t"
21
        "walignr1 wr8, wr0, wr1 \n\t"
22
        "pld [r4] \n\t"
23
        "pld [r4, #32] \n\t"
24
        "walignr1 wr10, wr3, wr4 \n\t"
25
        "wstrd wr8, [%[block]] \n\t"
26
        "add %[block], %[block], %[line_size] \n\t"
27
        "wstrd wr10, [r5] \n\t"
28
        "add r5, r5, %[line_size] \n\t"
29
        "bne 1b \n\t"
30
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
31
        :
32
        : "memory", "r4", "r5", "r12");
33
}
34

  
35
void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
36
{
37
    int stride = line_size;
38
    __asm__ __volatile__ (
39
        "and r12, %[pixels], #7 \n\t"
40
        "bic %[pixels], %[pixels], #7 \n\t"
41
        "tmcr wcgr1, r12 \n\t"
42
        "add r4, %[pixels], %[line_size] \n\t"
43
        "add r5, %[block], %[line_size] \n\t"
44
        "mov %[line_size], %[line_size], lsl #1 \n\t"
45
        "1: \n\t"
46
        "wldrd wr0, [%[pixels]] \n\t"
47
        "subs %[h], %[h], #2 \n\t"
48
        "wldrd wr1, [%[pixels], #8] \n\t"
49
        "add %[pixels], %[pixels], %[line_size] \n\t"
50
        "wldrd wr3, [r4] \n\t"
51
        "pld [%[pixels]] \n\t"
52
        "pld [%[pixels], #32] \n\t"
53
        "wldrd wr4, [r4, #8] \n\t"
54
        "add r4, r4, %[line_size] \n\t"
55
        "walignr1 wr8, wr0, wr1 \n\t"
56
        "wldrd wr0, [%[block]] \n\t"
57
        "wldrd wr2, [r5] \n\t"
58
        "pld [r4] \n\t"
59
        "pld [r4, #32] \n\t"
60
        "walignr1 wr10, wr3, wr4 \n\t"
61
        WAVG2B" wr8, wr8, wr0 \n\t"
62
        WAVG2B" wr10, wr10, wr2 \n\t"
63
        "wstrd wr8, [%[block]] \n\t"
64
        "add %[block], %[block], %[line_size] \n\t"
65
        "wstrd wr10, [r5] \n\t"
66
        "pld [%[block]] \n\t"
67
        "pld [%[block], #32] \n\t"
68
        "add r5, r5, %[line_size] \n\t"
69
        "pld [r5] \n\t"
70
        "pld [r5, #32] \n\t"
71
        "bne 1b \n\t"
72
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
73
        :
74
        : "memory", "r4", "r5", "r12");
75
}
76

  
77
void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
78
{
79
    int stride = line_size;
80
    __asm__ __volatile__ (
81
        "and r12, %[pixels], #7 \n\t"
82
        "bic %[pixels], %[pixels], #7 \n\t"
83
        "tmcr wcgr1, r12 \n\t"
84
        "add r4, %[pixels], %[line_size] \n\t"
85
        "add r5, %[block], %[line_size] \n\t"
86
        "mov %[line_size], %[line_size], lsl #1 \n\t"
87
        "1: \n\t"
88
        "wldrd wr0, [%[pixels]] \n\t"
89
        "wldrd wr1, [%[pixels], #8] \n\t"
90
        "subs %[h], %[h], #2 \n\t"
91
        "wldrd wr2, [%[pixels], #16] \n\t"
92
        "add %[pixels], %[pixels], %[line_size] \n\t"
93
        "wldrd wr3, [r4] \n\t"
94
        "pld [%[pixels]] \n\t"
95
        "pld [%[pixels], #32] \n\t"
96
        "walignr1 wr8, wr0, wr1 \n\t"
97
        "wldrd wr4, [r4, #8] \n\t"
98
        "walignr1 wr9, wr1, wr2 \n\t"
99
        "wldrd wr5, [r4, #16] \n\t"
100
        "add r4, r4, %[line_size] \n\t"
101
        "pld [r4] \n\t"
102
        "pld [r4, #32] \n\t"
103
        "walignr1 wr10, wr3, wr4 \n\t"
104
        "wstrd wr8, [%[block]] \n\t"
105
        "walignr1 wr11, wr4, wr5 \n\t"
106
        "wstrd wr9, [%[block], #8] \n\t"
107
        "add %[block], %[block], %[line_size] \n\t"
108
        "wstrd wr10, [r5] \n\t"
109
        "wstrd wr11, [r5, #8] \n\t"
110
        "add r5, r5, %[line_size] \n\t"
111
        "bne 1b \n\t"
112
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
113
        :
114
        : "memory", "r4", "r5", "r12");
115
}
116

  
117
void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
118
{
119
    int stride = line_size;
120
    __asm__ __volatile__ (
121
        "pld [%[pixels]]                \n\t"
122
        "pld [%[pixels], #32]           \n\t"
123
        "pld [%[block]]                 \n\t"
124
        "pld [%[block], #32]            \n\t"
125
        "and r12, %[pixels], #7         \n\t"
126
        "bic %[pixels], %[pixels], #7   \n\t"
127
        "tmcr wcgr1, r12                \n\t"
128
        "add r4, %[pixels], %[line_size]\n\t"
129
        "add r5, %[block], %[line_size] \n\t"
130
        "mov %[line_size], %[line_size], lsl #1 \n\t"
131
        "1:                             \n\t"
132
        "wldrd wr0, [%[pixels]]         \n\t"
133
        "wldrd wr1, [%[pixels], #8]     \n\t"
134
        "subs %[h], %[h], #2            \n\t"
135
        "wldrd wr2, [%[pixels], #16]    \n\t"
136
        "add %[pixels], %[pixels], %[line_size] \n\t"
137
        "wldrd wr3, [r4]                \n\t"
138
        "pld [%[pixels]]                \n\t"
139
        "pld [%[pixels], #32]           \n\t"
140
        "walignr1 wr8, wr0, wr1         \n\t"
141
        "wldrd wr4, [r4, #8]            \n\t"
142
        "walignr1 wr9, wr1, wr2         \n\t"
143
        "wldrd wr5, [r4, #16]           \n\t"
144
        "add r4, r4, %[line_size]       \n\t"
145
        "wldrd wr0, [%[block]]          \n\t"
146
        "pld [r4]                       \n\t"
147
        "wldrd wr1, [%[block], #8]      \n\t"
148
        "pld [r4, #32]                  \n\t"
149
        "wldrd wr2, [r5]                \n\t"
150
        "walignr1 wr10, wr3, wr4        \n\t"
151
        "wldrd wr3, [r5, #8]            \n\t"
152
        WAVG2B" wr8, wr8, wr0           \n\t"
153
        WAVG2B" wr9, wr9, wr1           \n\t"
154
        WAVG2B" wr10, wr10, wr2         \n\t"
155
        "wstrd wr8, [%[block]]          \n\t"
156
        "walignr1 wr11, wr4, wr5        \n\t"
157
        WAVG2B" wr11, wr11, wr3         \n\t"
158
        "wstrd wr9, [%[block], #8]      \n\t"
159
        "add %[block], %[block], %[line_size] \n\t"
160
        "wstrd wr10, [r5]               \n\t"
161
        "pld [%[block]]                 \n\t"
162
        "pld [%[block], #32]            \n\t"
163
        "wstrd wr11, [r5, #8]           \n\t"
164
        "add r5, r5, %[line_size]       \n\t"
165
        "pld [r5]                       \n\t"
166
        "pld [r5, #32]                  \n\t"
167
        "bne 1b \n\t"
168
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
169
        :
170
        : "memory", "r4", "r5", "r12");
171
}
172

  
173
void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
174
{
175
    int stride = line_size;
176
    // [wr0 wr1 wr2 wr3] for previous line
177
    // [wr4 wr5 wr6 wr7] for current line
178
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
179
    __asm__ __volatile__(
180
        "pld [%[pixels]]                \n\t"
181
        "pld [%[pixels], #32]           \n\t"
182
        "and r12, %[pixels], #7         \n\t"
183
        "bic %[pixels], %[pixels], #7   \n\t"
184
        "tmcr wcgr1, r12                \n\t"
185
        "add r12, r12, #1               \n\t"
186
        "add r4, %[pixels], %[line_size]\n\t"
187
        "tmcr wcgr2, r12                \n\t"
188
        "add r5, %[block], %[line_size] \n\t"
189
        "mov %[line_size], %[line_size], lsl #1 \n\t"
190

  
191
        "1:                             \n\t"
192
        "wldrd wr10, [%[pixels]]        \n\t"
193
        "cmp r12, #8                    \n\t"
194
        "wldrd wr11, [%[pixels], #8]    \n\t"
195
        "add %[pixels], %[pixels], %[line_size] \n\t"
196
        "wldrd wr13, [r4]               \n\t"
197
        "pld [%[pixels]]                \n\t"
198
        "wldrd wr14, [r4, #8]           \n\t"
199
        "pld [%[pixels], #32]           \n\t"
200
        "add r4, r4, %[line_size]       \n\t"
201
        "walignr1 wr0, wr10, wr11       \n\t"
202
        "pld [r4]                       \n\t"
203
        "pld [r4, #32]                  \n\t"
204
        "walignr1 wr2, wr13, wr14       \n\t"
205
        "wmoveq wr4, wr11               \n\t"
206
        "wmoveq wr6, wr14               \n\t"
207
        "walignr2ne wr4, wr10, wr11     \n\t"
208
        "walignr2ne wr6, wr13, wr14     \n\t"
209
        WAVG2B" wr0, wr0, wr4           \n\t"
210
        WAVG2B" wr2, wr2, wr6           \n\t"
211
        "wstrd wr0, [%[block]]          \n\t"
212
        "subs %[h], %[h], #2            \n\t"
213
        "wstrd wr2, [r5]                \n\t"
214
        "add %[block], %[block], %[line_size]   \n\t"
215
        "add r5, r5, %[line_size]       \n\t"
216
        "bne 1b                         \n\t"
217
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
218
        :
219
        : "r4", "r5", "r12", "memory");
220
}
221

  
222
void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
223
{
224
    int stride = line_size;
225
    // [wr0 wr1 wr2 wr3] for previous line
226
    // [wr4 wr5 wr6 wr7] for current line
227
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
228
    __asm__ __volatile__(
229
        "pld [%[pixels]]                \n\t"
230
        "pld [%[pixels], #32]           \n\t"
231
        "and r12, %[pixels], #7         \n\t"
232
        "bic %[pixels], %[pixels], #7   \n\t"
233
        "tmcr wcgr1, r12                \n\t"
234
        "add r12, r12, #1               \n\t"
235
        "add r4, %[pixels], %[line_size]\n\t"
236
        "tmcr wcgr2, r12                \n\t"
237
        "add r5, %[block], %[line_size] \n\t"
238
        "mov %[line_size], %[line_size], lsl #1 \n\t"
239

  
240
        "1:                             \n\t"
241
        "wldrd wr10, [%[pixels]]        \n\t"
242
        "cmp r12, #8                    \n\t"
243
        "wldrd wr11, [%[pixels], #8]    \n\t"
244
        "wldrd wr12, [%[pixels], #16]   \n\t"
245
        "add %[pixels], %[pixels], %[line_size] \n\t"
246
        "wldrd wr13, [r4]               \n\t"
247
        "pld [%[pixels]]                \n\t"
248
        "wldrd wr14, [r4, #8]           \n\t"
249
        "pld [%[pixels], #32]           \n\t"
250
        "wldrd wr15, [r4, #16]          \n\t"
251
        "add r4, r4, %[line_size]       \n\t"
252
        "walignr1 wr0, wr10, wr11       \n\t"
253
        "pld [r4]                       \n\t"
254
        "pld [r4, #32]                  \n\t"
255
        "walignr1 wr1, wr11, wr12       \n\t"
256
        "walignr1 wr2, wr13, wr14       \n\t"
257
        "walignr1 wr3, wr14, wr15       \n\t"
258
        "wmoveq wr4, wr11               \n\t"
259
        "wmoveq wr5, wr12               \n\t"
260
        "wmoveq wr6, wr14               \n\t"
261
        "wmoveq wr7, wr15               \n\t"
262
        "walignr2ne wr4, wr10, wr11     \n\t"
263
        "walignr2ne wr5, wr11, wr12     \n\t"
264
        "walignr2ne wr6, wr13, wr14     \n\t"
265
        "walignr2ne wr7, wr14, wr15     \n\t"
266
        WAVG2B" wr0, wr0, wr4           \n\t"
267
        WAVG2B" wr1, wr1, wr5           \n\t"
268
        "wstrd wr0, [%[block]]          \n\t"
269
        WAVG2B" wr2, wr2, wr6           \n\t"
270
        "wstrd wr1, [%[block], #8]      \n\t"
271
        WAVG2B" wr3, wr3, wr7           \n\t"
272
        "add %[block], %[block], %[line_size]   \n\t"
273
        "wstrd wr2, [r5]                \n\t"
274
        "subs %[h], %[h], #2            \n\t"
275
        "wstrd wr3, [r5, #8]            \n\t"
276
        "add r5, r5, %[line_size]       \n\t"
277
        "bne 1b                         \n\t"
278
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
279
        :
280
        : "r4", "r5", "r12", "memory");
281
}
282

  
283
void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
284
{
285
    int stride = line_size;
286
    // [wr0 wr1 wr2 wr3] for previous line
287
    // [wr4 wr5 wr6 wr7] for current line
288
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
289
    __asm__ __volatile__(
290
        "pld [%[pixels]]                \n\t"
291
        "pld [%[pixels], #32]           \n\t"
292
        "pld [%[block]]                 \n\t"
293
        "pld [%[block], #32]            \n\t"
294
        "and r12, %[pixels], #7         \n\t"
295
        "bic %[pixels], %[pixels], #7   \n\t"
296
        "tmcr wcgr1, r12                \n\t"
297
        "add r12, r12, #1               \n\t"
298
        "add r4, %[pixels], %[line_size]\n\t"
299
        "tmcr wcgr2, r12                \n\t"
300
        "add r5, %[block], %[line_size] \n\t"
301
        "mov %[line_size], %[line_size], lsl #1 \n\t"
302
        "pld [r5]                       \n\t"
303
        "pld [r5, #32]                  \n\t"
304

  
305
        "1:                             \n\t"
306
        "wldrd wr10, [%[pixels]]        \n\t"
307
        "cmp r12, #8                    \n\t"
308
        "wldrd wr11, [%[pixels], #8]    \n\t"
309
        "add %[pixels], %[pixels], %[line_size] \n\t"
310
        "wldrd wr13, [r4]               \n\t"
311
        "pld [%[pixels]]                \n\t"
312
        "wldrd wr14, [r4, #8]           \n\t"
313
        "pld [%[pixels], #32]           \n\t"
314
        "add r4, r4, %[line_size]       \n\t"
315
        "walignr1 wr0, wr10, wr11       \n\t"
316
        "pld [r4]                       \n\t"
... This diff was truncated because it exceeds the maximum size that can be displayed.
