Revision a2fc0f6a

View differences:

configure
789 789

  
790 790
ARCH_LIST='
791 791
    alpha
792
    armv4l
792
    arm
793 793
    bfin
794 794
    ia64
795 795
    m68k
......
933 933

  
934 934
# architecture extensions
935 935
altivec_deps="powerpc"
936
armv5te_deps="armv4l"
937
armv6_deps="armv4l"
938
armvfp_deps="armv4l"
939
iwmmxt_deps="armv4l"
936
armv5te_deps="arm"
937
armv6_deps="arm"
938
armvfp_deps="arm"
939
iwmmxt_deps="arm"
940 940
mmi_deps="mips"
941 941
mmx_deps="x86"
942 942
mmx2_deps="x86 mmx"
943
neon_deps="armv4l"
943
neon_deps="arm"
944 944
ssse3_deps="x86"
945 945
vis_deps="sparc"
946 946

  
......
1309 1309
        int test[sizeof(char*) - 7];
1310 1310
EOF
1311 1311
    ;;
1312
    # armv4l is a subset of armv[567]*l
1313 1312
    arm|armv[4567]*l)
1314
        arch="armv4l"
1313
        arch="arm"
1315 1314
    ;;
1316 1315
    alpha)
1317 1316
        arch="alpha"
......
1785 1784
fi
1786 1785

  
1787 1786
# We have to check if pld is a nop and disable it.
1788
enabled armv4l  && check_asm pld     '"pld [r0]"'
1787
enabled arm     && check_asm pld     '"pld [r0]"'
1789 1788
enabled armv5te && check_asm armv5te '"qadd r0, r0, r0"'
1790 1789
enabled armv6   && check_asm armv6   '"sadd16 r0, r0, r0"'
1791 1790
enabled armvfp  && check_asm armvfp  '"fadds s0, s0, s0"'
......
2136 2135
    echo "EBX available             ${ebx_available-no}"
2137 2136
    echo "EBP available             ${ebp_available-no}"
2138 2137
fi
2139
if test $arch = "armv4l"; then
2138
if test $arch = "arm"; then
2140 2139
    echo "ARMv5TE enabled           ${armv5te-no}"
2141 2140
    echo "ARMv6 enabled             ${armv6-no}"
2142 2141
    echo "ARM VFP enabled           ${armvfp-no}"
......
2358 2357
        doc               \
2359 2358
        libavcodec        \
2360 2359
        libavcodec/alpha  \
2361
        libavcodec/armv4l \
2360
        libavcodec/arm    \
2362 2361
        libavcodec/bfin   \
2363 2362
        libavcodec/i386   \
2364 2363
        libavcodec/mlib   \
libavcodec/Makefile
429 429
                                          alpha/mpegvideo_alpha.o       \
430 430
                                          alpha/simple_idct_alpha.o     \
431 431

  
432
OBJS-$(ARCH_ARMV4L)                    += armv4l/dsputil_arm.o          \
433
                                          armv4l/dsputil_arm_s.o        \
434
                                          armv4l/jrevdct_arm.o          \
435
                                          armv4l/mpegvideo_arm.o        \
436
                                          armv4l/simple_idct_arm.o      \
432
OBJS-$(ARCH_ARM)                       += arm/dsputil_arm.o             \
433
                                          arm/dsputil_arm_s.o           \
434
                                          arm/jrevdct_arm.o             \
435
                                          arm/mpegvideo_arm.o           \
436
                                          arm/simple_idct_arm.o         \
437 437

  
438
OBJS-$(HAVE_ARMV5TE)                   += armv4l/mpegvideo_armv5te.o    \
439
                                          armv4l/mpegvideo_armv5te_s.o  \
440
                                          armv4l/simple_idct_armv5te.o  \
438
OBJS-$(HAVE_ARMV5TE)                   += arm/mpegvideo_armv5te.o       \
439
                                          arm/mpegvideo_armv5te_s.o     \
440
                                          arm/simple_idct_armv5te.o     \
441 441

  
442
OBJS-$(HAVE_ARMV6)                     += armv4l/simple_idct_armv6.o    \
442
OBJS-$(HAVE_ARMV6)                     += arm/simple_idct_armv6.o       \
443 443

  
444
OBJS-$(HAVE_ARMVFP)                    += armv4l/dsputil_vfp.o          \
445
                                          armv4l/float_arm_vfp.o        \
444
OBJS-$(HAVE_ARMVFP)                    += arm/dsputil_vfp.o             \
445
                                          arm/float_arm_vfp.o           \
446 446

  
447
OBJS-$(HAVE_IWMMXT)                    += armv4l/dsputil_iwmmxt.o       \
448
                                          armv4l/mpegvideo_iwmmxt.o     \
447
OBJS-$(HAVE_IWMMXT)                    += arm/dsputil_iwmmxt.o          \
448
                                          arm/mpegvideo_iwmmxt.o        \
449 449

  
450
OBJS-$(HAVE_NEON)                      += armv4l/dsputil_neon.o         \
451
                                          armv4l/dsputil_neon_s.o       \
452
                                          armv4l/h264dsp_neon.o         \
453
                                          armv4l/h264idct_neon.o        \
454
                                          armv4l/simple_idct_neon.o     \
450
OBJS-$(HAVE_NEON)                      += arm/dsputil_neon.o            \
451
                                          arm/dsputil_neon_s.o          \
452
                                          arm/h264dsp_neon.o            \
453
                                          arm/h264idct_neon.o           \
454
                                          arm/simple_idct_neon.o        \
455 455

  
456 456
OBJS-$(ARCH_BFIN)                      += bfin/dsputil_bfin.o           \
457 457
                                          bfin/fdct_bfin.o              \
......
499 499
TESTS-$(ARCH_X86) += i386/cpuid-test$(EXESUF) motion-test$(EXESUF)
500 500

  
501 501
CLEANFILES = apiexample$(EXESUF)
502
DIRS = alpha armv4l bfin i386 mlib ppc ps2 sh4 sparc
502
DIRS = alpha arm bfin i386 mlib ppc ps2 sh4 sparc
503 503

  
504 504
include $(SUBDIR)../subdir.mak
505 505

  
libavcodec/arm/asm.S
1
/*
2
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

  
21
        .macro require8, val=1
22
        .eabi_attribute 24, \val
23
        .endm
24

  
25
        .macro preserve8, val=1
26
        .eabi_attribute 25, \val
27
        .endm
28

  
29
        .macro function name, export=0
30
.if \export
31
        .global \name
32
.endif
33
        .type   \name, %function
34
        .func   \name
35
\name:
36
        .endm
libavcodec/arm/dsputil_arm.c
1
/*
2
 * ARM optimized DSP utils
3
 * Copyright (c) 2001 Lionel Ulmer.
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

  
22
#include "libavcodec/dsputil.h"
23
#ifdef HAVE_IPP
24
#include <ipp.h>
25
#endif
26

  
27
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
28
void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
29
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
30

  
31
void j_rev_dct_ARM(DCTELEM *data);
32
void simple_idct_ARM(DCTELEM *data);
33

  
34
void simple_idct_armv5te(DCTELEM *data);
35
void simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
36
void simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
37

  
38
void ff_simple_idct_armv6(DCTELEM *data);
39
void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
40
void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
41

  
42
void ff_simple_idct_neon(DCTELEM *data);
43
void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
44
void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
45

  
46
/* XXX: local hack */
47
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
48
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
49

  
50
void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
51
void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
52
void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
53
void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
54

  
55
void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
56
void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
57
void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
58

  
59
void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
60

  
61
void ff_prefetch_arm(void *mem, int stride, int h);
62

  
63
CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8)
64
CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8)
65
CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8)
66
CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8)
67
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8)
68
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8)
69

  
70
void ff_add_pixels_clamped_ARM(short *block, unsigned char *dest,
71
                                      int line_size);
72

  
73
/* XXX: those functions should be suppressed ASAP when all IDCTs are
74
   converted */
75
static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
76
{
77
    j_rev_dct_ARM (block);
78
    ff_put_pixels_clamped(block, dest, line_size);
79
}
80
static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
81
{
82
    j_rev_dct_ARM (block);
83
    ff_add_pixels_clamped(block, dest, line_size);
84
}
85
static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
86
{
87
    simple_idct_ARM (block);
88
    ff_put_pixels_clamped(block, dest, line_size);
89
}
90
static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
91
{
92
    simple_idct_ARM (block);
93
    ff_add_pixels_clamped(block, dest, line_size);
94
}
95

  
96
#ifdef HAVE_IPP
97
static void simple_idct_ipp(DCTELEM *block)
98
{
99
    ippiDCT8x8Inv_Video_16s_C1I(block);
100
}
101
static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
102
{
103
    ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
104
}
105

  
106
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
107

  
108
static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
109
{
110
    ippiDCT8x8Inv_Video_16s_C1I(block);
111
#ifdef HAVE_IWMMXT
112
    add_pixels_clamped_iwmmxt(block, dest, line_size);
113
#else
114
    ff_add_pixels_clamped_ARM(block, dest, line_size);
115
#endif
116
}
117
#endif
118

  
119
int mm_support(void)
120
{
121
    return ENABLE_IWMMXT * FF_MM_IWMMXT;
122
}
123

  
124
void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)
125
{
126
    int idct_algo= avctx->idct_algo;
127

  
128
    ff_put_pixels_clamped = c->put_pixels_clamped;
129
    ff_add_pixels_clamped = c->add_pixels_clamped;
130

  
131
    if (avctx->lowres == 0) {
132
        if(idct_algo == FF_IDCT_AUTO){
133
#if defined(HAVE_IPP)
134
            idct_algo = FF_IDCT_IPP;
135
#elif defined(HAVE_NEON)
136
            idct_algo = FF_IDCT_SIMPLENEON;
137
#elif defined(HAVE_ARMV6)
138
            idct_algo = FF_IDCT_SIMPLEARMV6;
139
#elif defined(HAVE_ARMV5TE)
140
            idct_algo = FF_IDCT_SIMPLEARMV5TE;
141
#else
142
            idct_algo = FF_IDCT_ARM;
143
#endif
144
        }
145

  
146
        if(idct_algo==FF_IDCT_ARM){
147
            c->idct_put= j_rev_dct_ARM_put;
148
            c->idct_add= j_rev_dct_ARM_add;
149
            c->idct    = j_rev_dct_ARM;
150
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
151
        } else if (idct_algo==FF_IDCT_SIMPLEARM){
152
            c->idct_put= simple_idct_ARM_put;
153
            c->idct_add= simple_idct_ARM_add;
154
            c->idct    = simple_idct_ARM;
155
            c->idct_permutation_type= FF_NO_IDCT_PERM;
156
#ifdef HAVE_ARMV6
157
        } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
158
            c->idct_put= ff_simple_idct_put_armv6;
159
            c->idct_add= ff_simple_idct_add_armv6;
160
            c->idct    = ff_simple_idct_armv6;
161
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
162
#endif
163
#ifdef HAVE_ARMV5TE
164
        } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
165
            c->idct_put= simple_idct_put_armv5te;
166
            c->idct_add= simple_idct_add_armv5te;
167
            c->idct    = simple_idct_armv5te;
168
            c->idct_permutation_type = FF_NO_IDCT_PERM;
169
#endif
170
#ifdef HAVE_IPP
171
        } else if (idct_algo==FF_IDCT_IPP){
172
            c->idct_put= simple_idct_ipp_put;
173
            c->idct_add= simple_idct_ipp_add;
174
            c->idct    = simple_idct_ipp;
175
            c->idct_permutation_type= FF_NO_IDCT_PERM;
176
#endif
177
#ifdef HAVE_NEON
178
        } else if (idct_algo==FF_IDCT_SIMPLENEON){
179
            c->idct_put= ff_simple_idct_put_neon;
180
            c->idct_add= ff_simple_idct_add_neon;
181
            c->idct    = ff_simple_idct_neon;
182
            c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
183
#endif
184
        }
185
    }
186

  
187
    c->put_pixels_tab[0][0] = put_pixels16_arm;
188
    c->put_pixels_tab[0][1] = put_pixels16_x2_arm;
189
    c->put_pixels_tab[0][2] = put_pixels16_y2_arm;
190
    c->put_pixels_tab[0][3] = put_pixels16_xy2_arm;
191
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm;
192
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm;
193
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm;
194
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm;
195
    c->put_pixels_tab[1][0] = put_pixels8_arm;
196
    c->put_pixels_tab[1][1] = put_pixels8_x2_arm;
197
    c->put_pixels_tab[1][2] = put_pixels8_y2_arm;
198
    c->put_pixels_tab[1][3] = put_pixels8_xy2_arm;
199
    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;
200
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm;
201
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm;
202
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;
203

  
204
#ifdef HAVE_ARMV5TE
205
    c->prefetch = ff_prefetch_arm;
206
#endif
207

  
208
#ifdef HAVE_IWMMXT
209
    dsputil_init_iwmmxt(c, avctx);
210
#endif
211
#ifdef HAVE_ARMVFP
212
    ff_float_init_arm_vfp(c, avctx);
213
#endif
214
#ifdef HAVE_NEON
215
    ff_dsputil_init_neon(c, avctx);
216
#endif
217
}
libavcodec/arm/dsputil_arm_s.S
1
@
2
@ ARMv4 optimized DSP utils
3
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
4
@
5
@ This file is part of FFmpeg.
6
@
7
@ FFmpeg is free software; you can redistribute it and/or
8
@ modify it under the terms of the GNU Lesser General Public
9
@ License as published by the Free Software Foundation; either
10
@ version 2.1 of the License, or (at your option) any later version.
11
@
12
@ FFmpeg is distributed in the hope that it will be useful,
13
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
@ Lesser General Public License for more details.
16
@
17
@ You should have received a copy of the GNU Lesser General Public
18
@ License along with FFmpeg; if not, write to the Free Software
19
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
@
21

  
22
#include "config.h"
23
#include "asm.S"
24

  
25
        preserve8
26

  
27
#ifndef HAVE_PLD
28
.macro pld reg
29
.endm
30
#endif
31

  
32
#ifdef HAVE_ARMV5TE
33
function ff_prefetch_arm, export=1
34
        subs    r2, r2, #1
35
        pld     [r0]
36
        add     r0, r0, r1
37
        bne     ff_prefetch_arm
38
        bx      lr
39
        .endfunc
40
#endif
41

  
42
.macro  ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
43
        mov \Rd0, \Rn0, lsr #(\shift * 8)
44
        mov \Rd1, \Rn1, lsr #(\shift * 8)
45
        mov \Rd2, \Rn2, lsr #(\shift * 8)
46
        mov \Rd3, \Rn3, lsr #(\shift * 8)
47
        orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
48
        orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
49
        orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
50
        orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
51
.endm
52
.macro  ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
53
        mov \R0, \R0, lsr #(\shift * 8)
54
        orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
55
        mov \R1, \R1, lsr #(\shift * 8)
56
        orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
57
.endm
58
.macro  ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
59
        mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
60
        mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
61
        orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
62
        orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
63
.endm
64

  
65
.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
66
        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
67
        @ Rmask = 0xFEFEFEFE
68
        @ Rn = destroy
69
        eor \Rd0, \Rn0, \Rm0
70
        eor \Rd1, \Rn1, \Rm1
71
        orr \Rn0, \Rn0, \Rm0
72
        orr \Rn1, \Rn1, \Rm1
73
        and \Rd0, \Rd0, \Rmask
74
        and \Rd1, \Rd1, \Rmask
75
        sub \Rd0, \Rn0, \Rd0, lsr #1
76
        sub \Rd1, \Rn1, \Rd1, lsr #1
77
.endm
78

  
79
.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
80
        @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
81
        @ Rmask = 0xFEFEFEFE
82
        @ Rn = destroy
83
        eor \Rd0, \Rn0, \Rm0
84
        eor \Rd1, \Rn1, \Rm1
85
        and \Rn0, \Rn0, \Rm0
86
        and \Rn1, \Rn1, \Rm1
87
        and \Rd0, \Rd0, \Rmask
88
        and \Rd1, \Rd1, \Rmask
89
        add \Rd0, \Rn0, \Rd0, lsr #1
90
        add \Rd1, \Rn1, \Rd1, lsr #1
91
.endm
92

  
93
@ ----------------------------------------------------------------
94
        .align 8
95
function put_pixels16_arm, export=1
96
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
97
        @ block = word aligned, pixels = unaligned
98
        pld [r1]
99
        stmfd sp!, {r4-r11, lr} @ R14 is also called LR
100
        adr r5, 5f
101
        ands r4, r1, #3
102
        bic r1, r1, #3
103
        add r5, r5, r4, lsl #2
104
        ldrne pc, [r5]
105
1:
106
        ldmia r1, {r4-r7}
107
        add r1, r1, r2
108
        stmia r0, {r4-r7}
109
        pld [r1]
110
        subs r3, r3, #1
111
        add r0, r0, r2
112
        bne 1b
113
        ldmfd sp!, {r4-r11, pc}
114
        .align 8
115
2:
116
        ldmia r1, {r4-r8}
117
        add r1, r1, r2
118
        ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
119
        pld [r1]
120
        subs r3, r3, #1
121
        stmia r0, {r9-r12}
122
        add r0, r0, r2
123
        bne 2b
124
        ldmfd sp!, {r4-r11, pc}
125
        .align 8
126
3:
127
        ldmia r1, {r4-r8}
128
        add r1, r1, r2
129
        ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
130
        pld [r1]
131
        subs r3, r3, #1
132
        stmia r0, {r9-r12}
133
        add r0, r0, r2
134
        bne 3b
135
        ldmfd sp!, {r4-r11, pc}
136
        .align 8
137
4:
138
        ldmia r1, {r4-r8}
139
        add r1, r1, r2
140
        ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
141
        pld [r1]
142
        subs r3, r3, #1
143
        stmia r0, {r9-r12}
144
        add r0, r0, r2
145
        bne 4b
146
        ldmfd sp!, {r4-r11,pc}
147
        .align 8
148
5:
149
        .word 1b
150
        .word 2b
151
        .word 3b
152
        .word 4b
153
        .endfunc
154

  
155
@ ----------------------------------------------------------------
156
        .align 8
157
function put_pixels8_arm, export=1
158
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
159
        @ block = word aligned, pixels = unaligned
160
        pld [r1]
161
        stmfd sp!, {r4-r5,lr} @ R14 is also called LR
162
        adr r5, 5f
163
        ands r4, r1, #3
164
        bic r1, r1, #3
165
        add r5, r5, r4, lsl #2
166
        ldrne pc, [r5]
167
1:
168
        ldmia r1, {r4-r5}
169
        add r1, r1, r2
170
        subs r3, r3, #1
171
        pld [r1]
172
        stmia r0, {r4-r5}
173
        add r0, r0, r2
174
        bne 1b
175
        ldmfd sp!, {r4-r5,pc}
176
        .align 8
177
2:
178
        ldmia r1, {r4-r5, r12}
179
        add r1, r1, r2
180
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
181
        pld [r1]
182
        subs r3, r3, #1
183
        stmia r0, {r4-r5}
184
        add r0, r0, r2
185
        bne 2b
186
        ldmfd sp!, {r4-r5,pc}
187
        .align 8
188
3:
189
        ldmia r1, {r4-r5, r12}
190
        add r1, r1, r2
191
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
192
        pld [r1]
193
        subs r3, r3, #1
194
        stmia r0, {r4-r5}
195
        add r0, r0, r2
196
        bne 3b
197
        ldmfd sp!, {r4-r5,pc}
198
        .align 8
199
4:
200
        ldmia r1, {r4-r5, r12}
201
        add r1, r1, r2
202
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
203
        pld [r1]
204
        subs r3, r3, #1
205
        stmia r0, {r4-r5}
206
        add r0, r0, r2
207
        bne 4b
208
        ldmfd sp!, {r4-r5,pc}
209
        .align 8
210
5:
211
        .word 1b
212
        .word 2b
213
        .word 3b
214
        .word 4b
215
        .endfunc
216

  
217
@ ----------------------------------------------------------------
218
        .align 8
219
function put_pixels8_x2_arm, export=1
220
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
221
        @ block = word aligned, pixels = unaligned
222
        pld [r1]
223
        stmfd sp!, {r4-r10,lr} @ R14 is also called LR
224
        adr r5, 5f
225
        ands r4, r1, #3
226
        ldr r12, [r5]
227
        add r5, r5, r4, lsl #2
228
        bic r1, r1, #3
229
        ldrne pc, [r5]
230
1:
231
        ldmia r1, {r4-r5, r10}
232
        add r1, r1, r2
233
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
234
        pld [r1]
235
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
236
        subs r3, r3, #1
237
        stmia r0, {r8-r9}
238
        add r0, r0, r2
239
        bne 1b
240
        ldmfd sp!, {r4-r10,pc}
241
        .align 8
242
2:
243
        ldmia r1, {r4-r5, r10}
244
        add r1, r1, r2
245
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
246
        ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
247
        pld [r1]
248
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
249
        subs r3, r3, #1
250
        stmia r0, {r4-r5}
251
        add r0, r0, r2
252
        bne 2b
253
        ldmfd sp!, {r4-r10,pc}
254
        .align 8
255
3:
256
        ldmia r1, {r4-r5, r10}
257
        add r1, r1, r2
258
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
259
        ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
260
        pld [r1]
261
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
262
        subs r3, r3, #1
263
        stmia r0, {r4-r5}
264
        add r0, r0, r2
265
        bne 3b
266
        ldmfd sp!, {r4-r10,pc}
267
        .align 8
268
4:
269
        ldmia r1, {r4-r5, r10}
270
        add r1, r1, r2
271
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
272
        pld [r1]
273
        RND_AVG32 r8, r9, r6, r7, r5, r10, r12
274
        subs r3, r3, #1
275
        stmia r0, {r8-r9}
276
        add r0, r0, r2
277
        bne 4b
278
        ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
279
        .align 8
280
5:
281
        .word 0xFEFEFEFE
282
        .word 2b
283
        .word 3b
284
        .word 4b
285
        .endfunc
286

  
287
        .align 8
288
function put_no_rnd_pixels8_x2_arm, export=1
289
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
290
        @ block = word aligned, pixels = unaligned
291
        pld [r1]
292
        stmfd sp!, {r4-r10,lr} @ R14 is also called LR
293
        adr r5, 5f
294
        ands r4, r1, #3
295
        ldr r12, [r5]
296
        add r5, r5, r4, lsl #2
297
        bic r1, r1, #3
298
        ldrne pc, [r5]
299
1:
300
        ldmia r1, {r4-r5, r10}
301
        add r1, r1, r2
302
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
303
        pld [r1]
304
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
305
        subs r3, r3, #1
306
        stmia r0, {r8-r9}
307
        add r0, r0, r2
308
        bne 1b
309
        ldmfd sp!, {r4-r10,pc}
310
        .align 8
311
2:
312
        ldmia r1, {r4-r5, r10}
313
        add r1, r1, r2
314
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
315
        ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
316
        pld [r1]
317
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
318
        subs r3, r3, #1
319
        stmia r0, {r4-r5}
320
        add r0, r0, r2
321
        bne 2b
322
        ldmfd sp!, {r4-r10,pc}
323
        .align 8
324
3:
325
        ldmia r1, {r4-r5, r10}
326
        add r1, r1, r2
327
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
328
        ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
329
        pld [r1]
330
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
331
        subs r3, r3, #1
332
        stmia r0, {r4-r5}
333
        add r0, r0, r2
334
        bne 3b
335
        ldmfd sp!, {r4-r10,pc}
336
        .align 8
337
4:
338
        ldmia r1, {r4-r5, r10}
339
        add r1, r1, r2
340
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
341
        pld [r1]
342
        NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
343
        subs r3, r3, #1
344
        stmia r0, {r8-r9}
345
        add r0, r0, r2
346
        bne 4b
347
        ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
348
        .align 8
349
5:
350
        .word 0xFEFEFEFE
351
        .word 2b
352
        .word 3b
353
        .word 4b
354
        .endfunc
355

  
356

  
357
@ ----------------------------------------------------------------
358
        .align 8
359
function put_pixels8_y2_arm, export=1
360
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
361
        @ block = word aligned, pixels = unaligned
362
        pld [r1]
363
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
364
        adr r5, 5f
365
        ands r4, r1, #3
366
        mov r3, r3, lsr #1
367
        ldr r12, [r5]
368
        add r5, r5, r4, lsl #2
369
        bic r1, r1, #3
370
        ldrne pc, [r5]
371
1:
372
        ldmia r1, {r4-r5}
373
        add r1, r1, r2
374
6:      ldmia r1, {r6-r7}
375
        add r1, r1, r2
376
        pld [r1]
377
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
378
        ldmia r1, {r4-r5}
379
        add r1, r1, r2
380
        stmia r0, {r8-r9}
381
        add r0, r0, r2
382
        pld [r1]
383
        RND_AVG32 r8, r9, r6, r7, r4, r5, r12
384
        subs r3, r3, #1
385
        stmia r0, {r8-r9}
386
        add r0, r0, r2
387
        bne 6b
388
        ldmfd sp!, {r4-r11,pc}
389
        .align 8
390
2:
391
        ldmia r1, {r4-r6}
392
        add r1, r1, r2
393
        pld [r1]
394
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
395
6:      ldmia r1, {r7-r9}
396
        add r1, r1, r2
397
        pld [r1]
398
        ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
399
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
400
        stmia r0, {r10-r11}
401
        add r0, r0, r2
402
        ldmia r1, {r4-r6}
403
        add r1, r1, r2
404
        pld [r1]
405
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
406
        subs r3, r3, #1
407
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
408
        stmia r0, {r10-r11}
409
        add r0, r0, r2
410
        bne 6b
411
        ldmfd sp!, {r4-r11,pc}
412
        .align 8
413
3:
414
        ldmia r1, {r4-r6}
415
        add r1, r1, r2
416
        pld [r1]
417
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
418
6:      ldmia r1, {r7-r9}
419
        add r1, r1, r2
420
        pld [r1]
421
        ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
422
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
423
        stmia r0, {r10-r11}
424
        add r0, r0, r2
425
        ldmia r1, {r4-r6}
426
        add r1, r1, r2
427
        pld [r1]
428
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
429
        subs r3, r3, #1
430
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
431
        stmia r0, {r10-r11}
432
        add r0, r0, r2
433
        bne 6b
434
        ldmfd sp!, {r4-r11,pc}
435
        .align 8
436
4:
437
        ldmia r1, {r4-r6}
438
        add r1, r1, r2
439
        pld [r1]
440
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
441
6:      ldmia r1, {r7-r9}
442
        add r1, r1, r2
443
        pld [r1]
444
        ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
445
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
446
        stmia r0, {r10-r11}
447
        add r0, r0, r2
448
        ldmia r1, {r4-r6}
449
        add r1, r1, r2
450
        pld [r1]
451
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
452
        subs r3, r3, #1
453
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
454
        stmia r0, {r10-r11}
455
        add r0, r0, r2
456
        bne 6b
457
        ldmfd sp!, {r4-r11,pc}
458

  
459
        .align 8
460
5:
461
        .word 0xFEFEFEFE
462
        .word 2b
463
        .word 3b
464
        .word 4b
465
        .endfunc
466

  
467
        .align 8
468
function put_no_rnd_pixels8_y2_arm, export=1
469
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
470
        @ block = word aligned, pixels = unaligned
471
        @ Vertical (y2) half-pel copy without rounding: each output row is
        @ the truncating average of two vertically adjacent source rows.
        @ r1&3 selects one of four alignment-specialised loops through the
        @ jump table at 5f; r12 holds the 0xFEFEFEFE mask consumed by
        @ NO_RND_AVG32, and h (r3) is halved because every loop pass emits
        @ two rows.
        pld [r1]
472
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
473
        adr r5, 5f
474
        ands r4, r1, #3
475
        mov r3, r3, lsr #1
476
        ldr r12, [r5]
477
        add r5, r5, r4, lsl #2
478
        bic r1, r1, #3
479
        ldrne pc, [r5]
480
        @ alignment 0: source words can be loaded directly
1:
481
        ldmia r1, {r4-r5}
482
        add r1, r1, r2
483
6:      ldmia r1, {r6-r7}
484
        add r1, r1, r2
485
        pld [r1]
486
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
487
        ldmia r1, {r4-r5}
488
        add r1, r1, r2
489
        stmia r0, {r8-r9}
490
        add r0, r0, r2
491
        pld [r1]
492
        NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
493
        subs r3, r3, #1
494
        stmia r0, {r8-r9}
495
        add r0, r0, r2
496
        bne 6b
497
        ldmfd sp!, {r4-r11,pc}
498
        .align 8
499
        @ alignment 1: 12-byte loads realigned with ADJ_ALIGN_DOUBLEWORD 1
2:
500
        ldmia r1, {r4-r6}
501
        add r1, r1, r2
502
        pld [r1]
503
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
504
6:      ldmia r1, {r7-r9}
505
        add r1, r1, r2
506
        pld [r1]
507
        ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
508
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
509
        stmia r0, {r10-r11}
510
        add r0, r0, r2
511
        ldmia r1, {r4-r6}
512
        add r1, r1, r2
513
        pld [r1]
514
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
515
        subs r3, r3, #1
516
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
517
        stmia r0, {r10-r11}
518
        add r0, r0, r2
519
        bne 6b
520
        ldmfd sp!, {r4-r11,pc}
521
        .align 8
522
        @ alignment 2
3:
523
        ldmia r1, {r4-r6}
524
        add r1, r1, r2
525
        pld [r1]
526
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
527
6:      ldmia r1, {r7-r9}
528
        add r1, r1, r2
529
        pld [r1]
530
        ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
531
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
532
        stmia r0, {r10-r11}
533
        add r0, r0, r2
534
        ldmia r1, {r4-r6}
535
        add r1, r1, r2
536
        pld [r1]
537
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
538
        subs r3, r3, #1
539
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
540
        stmia r0, {r10-r11}
541
        add r0, r0, r2
542
        bne 6b
543
        ldmfd sp!, {r4-r11,pc}
544
        .align 8
545
        @ alignment 3
4:
546
        ldmia r1, {r4-r6}
547
        add r1, r1, r2
548
        pld [r1]
549
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
550
6:      ldmia r1, {r7-r9}
551
        add r1, r1, r2
552
        pld [r1]
553
        ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
554
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
555
        stmia r0, {r10-r11}
556
        add r0, r0, r2
557
        ldmia r1, {r4-r6}
558
        add r1, r1, r2
559
        pld [r1]
560
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
561
        subs r3, r3, #1
562
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
563
        stmia r0, {r10-r11}
564
        add r0, r0, r2
565
        bne 6b
566
        ldmfd sp!, {r4-r11,pc}
567
        .align 8
568
        @ literal pool: no-rounding mask, then the alignment jump table
5:
569
        .word 0xFEFEFEFE
570
        .word 2b
571
        .word 3b
572
        .word 4b
573
        .endfunc
574

  
575
@ ----------------------------------------------------------------
576
@ One row-step of the diagonal (xy2) half-pel filter for a fixed source
@ alignment.  Loads the next source row, realigns it into two readings at
@ adjacent byte offsets (via ADJ_ALIGN_DOUBLEWORD_D, defined earlier in
@ this file), and leaves the partial sums l1 in r8/r9 and h1 in r10/r11.
@ r12 must point at the enclosing function's constant pool (see 5: below);
@ also decrements the row counter r3.
.macro  RND_XY2_IT align
577
        @ l1=  (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
578
        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
579
.if \align == 0
580
        ldmia r1, {r6-r8}
581
.elseif \align == 3
582
        ldmia r1, {r5-r7}
583
.else
584
        ldmia r1, {r8-r10}
585
.endif
586
        add r1, r1, r2
587
        pld [r1]
588
        @ a ends up in r4/r5 and b in r6/r7
.if \align == 0
589
        ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
590
.elseif \align == 1
591
        ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
592
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
593
.elseif \align == 2
594
        ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
595
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
596
.elseif \align == 3
597
        ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
598
.endif
599
        ldr r14, [r12, #0]      @ 0x03030303
600
        tst r3, #1
601
        and r8, r4, r14
602
        and r9, r5, r14
603
        and r10, r6, r14
604
        and r11, r7, r14
605
        @ the rounding bias (pool word at #16) is added only when the row
        @ counter is even (tst sets Z for (r3 & 1) == 0)
        ldreq r14, [r12, #16]   @ 0x02020202/0x01010101
606
        add r8, r8, r10
607
        add r9, r9, r11
608
        addeq r8, r8, r14
609
        addeq r9, r9, r14
610
        ldr r14, [r12, #20]     @ 0xFCFCFCFC >> 2
611
        and r4, r14, r4, lsr #2
612
        and r5, r14, r5, lsr #2
613
        and r6, r14, r6, lsr #2
614
        and r7, r14, r7, lsr #2
615
        add r10, r4, r6
616
        add r11, r5, r7
617
        subs r3, r3, #1
618
.endm
619

  
620
@ Full per-alignment loop body for one *_xy2 function: prime with one
@ RND_XY2_IT, then each iteration computes the next row's partial sums,
@ combines them with the saved previous ones, scales and masks with the
@ pool word at #24 (0x0F0F0F0F), and stores one 8-pixel output row.
@ Loops via the local label 6 while the counter set up by RND_XY2_IT
@ stays non-negative, then restores registers and returns.
.macro RND_XY2_EXPAND align
621
        RND_XY2_IT \align
622
6:      stmfd sp!, {r8-r11}
623
        RND_XY2_IT \align
624
        ldmfd sp!, {r4-r7}
625
        add r4, r4, r8
626
        add r5, r5, r9
627
        add r6, r6, r10
628
        add r7, r7, r11
629
        ldr r14, [r12, #24]     @ 0x0F0F0F0F
630
        and r4, r14, r4, lsr #2
631
        and r5, r14, r5, lsr #2
632
        add r4, r4, r6
633
        add r5, r5, r7
634
        stmia r0, {r4-r5}
635
        add r0, r0, r2
636
        bge 6b
637
        ldmfd sp!, {r4-r11,pc}
638
.endm
639

  
640
        .align 8
641
function put_pixels8_xy2_arm, export=1
642
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
643
        @ block = word aligned, pixels = unaligned
644
        @ Diagonal (x+y) half-pel interpolation with rounding: dispatches on
        @ source alignment (r1 & 3) through the jump table in the pool at 5f
        @ and expands RND_XY2_EXPAND once per alignment.  Pool layout read
        @ via r12: [0]=low-bit mask, [4..12]=jump table, [16]=rounding bias,
        @ [20]=high-bit mask >> 2, [24]=combine mask.
        pld [r1]
645
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
646
        adrl r12, 5f
647
        ands r4, r1, #3
648
        add r5, r12, r4, lsl #2
649
        bic r1, r1, #3
650
        ldrne pc, [r5]
651
1:
652
        RND_XY2_EXPAND 0
653

  
654
        .align 8
655
2:
656
        RND_XY2_EXPAND 1
657

  
658
        .align 8
659
3:
660
        RND_XY2_EXPAND 2
661

  
662
        .align 8
663
4:
664
        RND_XY2_EXPAND 3
665

  
666
5:
667
        .word 0x03030303
668
        .word 2b
669
        .word 3b
670
        .word 4b
671
        .word 0x02020202
672
        .word 0xFCFCFCFC >> 2
673
        .word 0x0F0F0F0F
674
        .endfunc
675

  
676
        .align 8
677
function put_no_rnd_pixels8_xy2_arm, export=1
678
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
679
        @ block = word aligned, pixels = unaligned
680
        @ Same as put_pixels8_xy2_arm but without rounding: the pool bias
        @ word is 0x01010101 instead of 0x02020202, everything else is the
        @ same alignment-dispatched expansion of RND_XY2_EXPAND.
        pld [r1]
681
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
682
        adrl r12, 5f
683
        ands r4, r1, #3
684
        add r5, r12, r4, lsl #2
685
        bic r1, r1, #3
686
        ldrne pc, [r5]
687
1:
688
        RND_XY2_EXPAND 0
689

  
690
        .align 8
691
2:
692
        RND_XY2_EXPAND 1
693

  
694
        .align 8
695
3:
696
        RND_XY2_EXPAND 2
697

  
698
        .align 8
699
4:
700
        RND_XY2_EXPAND 3
701

  
702
5:
703
        .word 0x03030303
704
        .word 2b
705
        .word 3b
706
        .word 4b
707
        .word 0x01010101
708
        .word 0xFCFCFCFC >> 2
709
        .word 0x0F0F0F0F
710
        .endfunc
711

  
712
@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride)
713
function ff_add_pixels_clamped_ARM, export=1
714
        @ Adds the 8x8 int16 coefficient block to the 8x8 byte block at
        @ dest (row stride in r2), clamping each result to 0..255.  One
        @ destination row (two words) is rebuilt per iteration; r10 counts
        @ the 8 rows.  Clamp trick: if bit 8 of a byte sum is set the sum
        @ over- or under-flowed, and ~coeff >> 24 yields the saturated
        @ 0x00 (negative) or 0xFF (overflow) byte.
        push            {r4-r10}
715
        mov             r10, #8
716
1:
717
        ldr             r4,  [r1]               /* load dest */
718
        /* block[0] and block[1]*/
719
        ldrsh           r5,  [r0]
720
        ldrsh           r7,  [r0, #2]
721
        and             r6,  r4,  #0xFF
722
        and             r8,  r4,  #0xFF00
723
        add             r6,  r5,  r6
724
        add             r8,  r7,  r8,  lsr #8
725
        mvn             r5,  r5
726
        mvn             r7,  r7
727
        tst             r6,  #0x100
728
        movne           r6,  r5,  lsr #24
729
        tst             r8,  #0x100
730
        movne           r8,  r7,  lsr #24
731
        mov             r9,  r6
732
        ldrsh           r5,  [r0, #4]           /* moved from [A] */
733
        orr             r9,  r9,  r8, lsl #8
734
        /* block[2] and block[3] */
735
        /* [A] */
736
        ldrsh           r7,  [r0, #6]
737
        and             r6,  r4,  #0xFF0000
738
        and             r8,  r4,  #0xFF000000
739
        add             r6,  r5,  r6, lsr #16
740
        add             r8,  r7,  r8, lsr #24
741
        mvn             r5,  r5
742
        mvn             r7,  r7
743
        tst             r6,  #0x100
744
        movne           r6,  r5,  lsr #24
745
        tst             r8,  #0x100
746
        movne           r8,  r7,  lsr #24
747
        orr             r9,  r9,  r6, lsl #16
748
        ldr             r4,  [r1, #4]           /* moved from [B] */
749
        orr             r9,  r9,  r8, lsl #24
750
        /* store dest */
751
        ldrsh           r5,  [r0, #8]           /* moved from [C] */
752
        str             r9,  [r1]
753

  
754
        /* load dest */
755
        /* [B] */
756
        /* block[4] and block[5] */
757
        /* [C] */
758
        ldrsh           r7,  [r0, #10]
759
        and             r6,  r4,  #0xFF
760
        and             r8,  r4,  #0xFF00
761
        add             r6,  r5,  r6
762
        add             r8,  r7,  r8, lsr #8
763
        mvn             r5,  r5
764
        mvn             r7,  r7
765
        tst             r6,  #0x100
766
        movne           r6,  r5,  lsr #24
767
        tst             r8,  #0x100
768
        movne           r8,  r7,  lsr #24
769
        mov             r9,  r6
770
        ldrsh           r5,  [r0, #12]          /* moved from [D] */
771
        orr             r9,  r9,  r8, lsl #8
772
        /* block[6] and block[7] */
773
        /* [D] */
774
        ldrsh           r7,  [r0, #14]
775
        and             r6,  r4,  #0xFF0000
776
        and             r8,  r4,  #0xFF000000
777
        add             r6,  r5,  r6, lsr #16
778
        add             r8,  r7,  r8, lsr #24
779
        mvn             r5,  r5
780
        mvn             r7,  r7
781
        tst             r6,  #0x100
782
        movne           r6,  r5,  lsr #24
783
        tst             r8,  #0x100
784
        movne           r8,  r7,  lsr #24
785
        orr             r9,  r9,  r6, lsl #16
786
        add             r0,  r0,  #16           /* moved from [E] */
787
        orr             r9,  r9,  r8, lsl #24
788
        subs            r10, r10, #1            /* moved from [F] */
789
        /* store dest */
790
        str             r9,  [r1, #4]
791

  
792
        /* [E] */
793
        /* [F] */
794
        add             r1,  r1,  r2
795
        bne             1b
796

  
797
        pop             {r4-r10}
798
        bx              lr
799
        .endfunc
libavcodec/arm/dsputil_iwmmxt.c
1
/*
2
 * iWMMXt optimized DSP utils
3
 * Copyright (c) 2004 AGAWA Koji
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

  
22
#include "libavcodec/dsputil.h"
23

  
24
/* Instantiate the shared template once for the "no rounding" variants:
 * DEF(put, pixels8) expands to put_no_rnd_pixels8_iwmmxt and so on.
 * wavg2b is the truncating iWMMXt byte average; SET_RND loads the bias
 * the template uses (NOTE(review): exact wcgr usage lives in the
 * template -- confirm there). */
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
25
#define SET_RND(regd)  __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
26
#define WAVG2B "wavg2b"
27
#include "dsputil_iwmmxt_rnd_template.c"
28
#undef DEF
29
#undef SET_RND
30
#undef WAVG2B
31

  
32
/* Instantiate the shared template a second time for the rounding
 * variants: DEF(put, pixels8) expands to put_pixels8_iwmmxt and so on.
 * wavg2br is the rounding iWMMXt byte average. */
#define DEF(x, y) x ## _ ## y ##_iwmmxt
33
#define SET_RND(regd)  __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
34
#define WAVG2B "wavg2br"
35
#include "dsputil_iwmmxt_rnd_template.c"
36
#undef DEF
37
#undef SET_RND
38
/* Fix: this previously read "#undef WAVG2BR", which undefined a macro
 * that was never defined and left WAVG2B leaking past this block. */
#undef WAVG2B
39

  
40
// Vertical (y2) half-pel averaging for 8-pixel-wide rows: each output
// row is the byte-wise average of two vertically adjacent source rows.
// AVG selects the iWMMXt averaging instruction (rounding "wavg2br" or
// truncating "wavg2b").  wcgr1 holds the source byte misalignment for
// walignr1; two rows are produced per loop iteration, so h is assumed
// even -- TODO confirm at the call sites.
// NOTE: original comment said "need scheduling" -- instruction order has
// not been tuned.
#define OP(AVG)                                         \
42
    __asm__ volatile (                                      \
43
        /* alignment */                                 \
44
        "and r12, %[pixels], #7 \n\t"                   \
45
        "bic %[pixels], %[pixels], #7 \n\t"             \
46
        "tmcr wcgr1, r12 \n\t"                          \
47
                                                        \
48
        "wldrd wr0, [%[pixels]] \n\t"                   \
49
        "wldrd wr1, [%[pixels], #8] \n\t"               \
50
        "add %[pixels], %[pixels], %[line_size] \n\t"   \
51
        "walignr1 wr4, wr0, wr1 \n\t"                   \
52
                                                        \
53
        "1: \n\t"                                       \
54
                                                        \
55
        "wldrd wr2, [%[pixels]] \n\t"                   \
56
        "wldrd wr3, [%[pixels], #8] \n\t"               \
57
        "add %[pixels], %[pixels], %[line_size] \n\t"   \
58
        "pld [%[pixels]] \n\t"                          \
59
        "walignr1 wr5, wr2, wr3 \n\t"                   \
60
        AVG " wr6, wr4, wr5 \n\t"                       \
61
        "wstrd wr6, [%[block]] \n\t"                    \
62
        "add %[block], %[block], %[line_size] \n\t"     \
63
                                                        \
64
        "wldrd wr0, [%[pixels]] \n\t"                   \
65
        "wldrd wr1, [%[pixels], #8] \n\t"               \
66
        "add %[pixels], %[pixels], %[line_size] \n\t"   \
67
        "walignr1 wr4, wr0, wr1 \n\t"                   \
68
        "pld [%[pixels]] \n\t"                          \
69
        AVG " wr6, wr4, wr5 \n\t"                       \
70
        "wstrd wr6, [%[block]] \n\t"                    \
71
        "add %[block], %[block], %[line_size] \n\t"     \
72
                                                        \
73
        "subs %[h], %[h], #2 \n\t"                      \
74
        "bne 1b \n\t"                                   \
75
        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)  \
76
        : [line_size]"r"(line_size) \
77
        : "memory", "r12");
78
/* Rounding variant. */
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
79
{
80
    OP("wavg2br");
81
}
82
/* Truncating (no-round) variant. */
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
83
{
84
    OP("wavg2b");
85
}
86
#undef OP
87

  
88
/* Add the 8x8 int16 coefficient block to the 8x8 byte region at
 * 'pixels' (row stride line_size), saturating each sum to 0..255
 * (signed-saturating waddhss followed by unsigned-saturating packing
 * wpackhus).  Two rows are handled per iteration through the
 * pixels/pixels2 pointer pair; r12 counts the 4 double-row passes. */
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
89
{
90
    uint8_t *pixels2 = pixels + line_size;
91

  
92
    __asm__ volatile (
93
        "mov            r12, #4                 \n\t"
94
        "1:                                     \n\t"
95
        "pld            [%[pixels], %[line_size2]]              \n\t"
96
        "pld            [%[pixels2], %[line_size2]]             \n\t"
97
        "wldrd          wr4, [%[pixels]]        \n\t"
98
        "wldrd          wr5, [%[pixels2]]       \n\t"
99
        "pld            [%[block], #32]         \n\t"
100
        "wunpckelub     wr6, wr4                \n\t"
101
        "wldrd          wr0, [%[block]]         \n\t"
102
        "wunpckehub     wr7, wr4                \n\t"
103
        "wldrd          wr1, [%[block], #8]     \n\t"
104
        "wunpckelub     wr8, wr5                \n\t"
105
        "wldrd          wr2, [%[block], #16]    \n\t"
106
        "wunpckehub     wr9, wr5                \n\t"
107
        "wldrd          wr3, [%[block], #24]    \n\t"
108
        "add            %[block], %[block], #32 \n\t"
109
        "waddhss        wr10, wr0, wr6          \n\t"
110
        "waddhss        wr11, wr1, wr7          \n\t"
111
        "waddhss        wr12, wr2, wr8          \n\t"
112
        "waddhss        wr13, wr3, wr9          \n\t"
113
        "wpackhus       wr14, wr10, wr11        \n\t"
114
        "wpackhus       wr15, wr12, wr13        \n\t"
115
        "wstrd          wr14, [%[pixels]]       \n\t"
116
        "add            %[pixels], %[pixels], %[line_size2]     \n\t"
117
        "subs           r12, r12, #1            \n\t"
118
        "wstrd          wr15, [%[pixels2]]      \n\t"
119
        "add            %[pixels2], %[pixels2], %[line_size2]   \n\t"
120
        "bne            1b                      \n\t"
121
        : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
122
        : [line_size2]"r"(line_size << 1)
123
        : "cc", "memory", "r12");
124
}
125

  
126
/* Zero six 64-element DCT blocks (128 * 6 = 768 bytes), 32 bytes per
 * iteration.
 * NOTE(review): 'subs' alters the condition flags but "cc" is not in the
 * clobber list (compare add_pixels_clamped_iwmmxt above) -- confirm this
 * is safe with the compilers used. */
static void clear_blocks_iwmmxt(DCTELEM *blocks)
127
{
128
    __asm__ volatile(
129
                "wzero wr0                      \n\t"
130
                "mov r1, #(128 * 6 / 32)        \n\t"
131
                "1:                             \n\t"
132
                "wstrd wr0, [%0]                \n\t"
133
                "wstrd wr0, [%0, #8]            \n\t"
134
                "wstrd wr0, [%0, #16]           \n\t"
135
                "wstrd wr0, [%0, #24]           \n\t"
136
                "subs r1, r1, #1                \n\t"
137
                "add %0, %0, #32                \n\t"
138
                "bne 1b                         \n\t"
139
                : "+r"(blocks)
140
                :
141
                : "r1"
142
        );
143
}
144

  
145
/* Deliberate no-op with the same signature as the pixel-op helpers
 * above; presumably kept as a placeholder function pointer -- TODO
 * confirm where it is installed. */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    /* intentionally empty */
}
149

  
150
/* There is no simple run-time test for iWMMXt: if this file is compiled
151
 * in at all we assume the extension is present and install the functions
152
 * (subject to avctx->dsp_mask handling in dsputil_init_iwmmxt below). */
153
int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */
154

  
155
/* Install the iWMMXt-optimised function pointers into the DSPContext.
 * avctx->dsp_mask may force extension flags on (FF_MM_FORCE) or mask
 * them off before the FF_MM_IWMMXT check; when iWMMXt is disabled the
 * DSPContext is left untouched. */
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
156
{
157
    if (avctx->dsp_mask) {
158
        if (avctx->dsp_mask & FF_MM_FORCE)
159
            mm_flags |= (avctx->dsp_mask & 0xffff);
160
        else
161
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
162
    }
163

  
164
    if (!(mm_flags & FF_MM_IWMMXT)) return;
165

  
166
    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
167

  
168
    c->clear_blocks = clear_blocks_iwmmxt;
169

  
170
    /* 16x16 put: [1]=x half-pel, [2]=y half-pel, [3]=xy half-pel */
    c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
171
    c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
172
    c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
173
    c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
174
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
175
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
176
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
177
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
178

  
179
    /* 8x8 put */
    c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
180
    c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
181
    c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
182
    c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
183
    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
184
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
185
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
186
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
187

  
188
    /* 16x16 avg */
    c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
189
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
190
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
191
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
192
    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
193
    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
194
    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
195
    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
196

  
197
    /* 8x8 avg */
    c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
198
    c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
199
    c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
200
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
201
    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
202
    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
203
    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
204
    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
205
}
libavcodec/arm/dsputil_iwmmxt_rnd_template.c
1
/*
2
 * iWMMXt optimized DSP utils
3
 * copyright (c) 2004 AGAWA Koji
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

  
22
/* Template body: straight copy of 8-pixel-wide rows from the possibly
 * unaligned 'pixels' to 'block'.  The byte misalignment goes into wcgr1
 * and walignr1 re-aligns each 16-byte load; two rows are copied per
 * iteration through the pixels/r4 and block/r5 pointer pairs, so h is
 * assumed even -- TODO confirm at the call sites.  DEF() names the
 * function per instantiation (put_pixels8_iwmmxt or
 * put_no_rnd_pixels8_iwmmxt; both expansions are identical for put). */
void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
23
{
24
    int stride = line_size;
25
    __asm__ volatile (
26
        "and r12, %[pixels], #7 \n\t"
27
        "bic %[pixels], %[pixels], #7 \n\t"
28
        "tmcr wcgr1, r12 \n\t"
29
        "add r4, %[pixels], %[line_size] \n\t"
30
        "add r5, %[block], %[line_size] \n\t"
31
        "mov %[line_size], %[line_size], lsl #1 \n\t"
32
        "1: \n\t"
33
        "wldrd wr0, [%[pixels]] \n\t"
34
        "subs %[h], %[h], #2 \n\t"
35
        "wldrd wr1, [%[pixels], #8] \n\t"
36
        "add %[pixels], %[pixels], %[line_size] \n\t"
37
        "wldrd wr3, [r4] \n\t"
38
        "pld [%[pixels]] \n\t"
39
        "pld [%[pixels], #32] \n\t"
40
        "wldrd wr4, [r4, #8] \n\t"
41
        "add r4, r4, %[line_size] \n\t"
42
        "walignr1 wr8, wr0, wr1 \n\t"
43
        "pld [r4] \n\t"
44
        "pld [r4, #32] \n\t"
45
        "walignr1 wr10, wr3, wr4 \n\t"
46
        "wstrd wr8, [%[block]] \n\t"
47
        "add %[block], %[block], %[line_size] \n\t"
48
        "wstrd wr10, [r5] \n\t"
49
        "add r5, r5, %[line_size] \n\t"
50
        "bne 1b \n\t"
51
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
52
        :
53
        : "memory", "r4", "r5", "r12");
54
}
55

  
56
void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
57
{
58
    int stride = line_size;
59
    __asm__ volatile (
60
        "and r12, %[pixels], #7 \n\t"
61
        "bic %[pixels], %[pixels], #7 \n\t"
62
        "tmcr wcgr1, r12 \n\t"
63
        "add r4, %[pixels], %[line_size] \n\t"
64
        "add r5, %[block], %[line_size] \n\t"
65
        "mov %[line_size], %[line_size], lsl #1 \n\t"
66
        "1: \n\t"
67
        "wldrd wr0, [%[pixels]] \n\t"
68
        "subs %[h], %[h], #2 \n\t"
69
        "wldrd wr1, [%[pixels], #8] \n\t"
70
        "add %[pixels], %[pixels], %[line_size] \n\t"
71
        "wldrd wr3, [r4] \n\t"
72
        "pld [%[pixels]] \n\t"
73
        "pld [%[pixels], #32] \n\t"
74
        "wldrd wr4, [r4, #8] \n\t"
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff