ffmpeg / libavcodec / arm / dsputil_iwmmxt.c @ 8dbe5856
History | View | Annotate | Download (8.86 KB)
1 |
/*
 * iWMMXt optimized DSP utils
 * Copyright (c) 2004 AGAWA Koji
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
21 |
|
22 |
#include "libavutil/cpu.h" |
23 |
#include "libavcodec/dsputil.h" |
24 |
|
25 |
/*
 * The pixel functions are generated by including
 * dsputil_iwmmxt_rnd_template.c twice with different helper macros:
 *
 *   DEF(x, y)     builds the name of each generated function,
 *   SET_RND(regd) loads a constant into r12 and broadcasts it into regd
 *                 with tbcsth (r12 is declared as clobbered),
 *   WAVG2B        is the mnemonic of the byte-average instruction to use.
 *
 * All three macros are undefined again after each inclusion so that
 * nothing leaks into the rest of this file.
 */

/* First pass: names of the form <x>_no_rnd_<y>_iwmmxt, constant 1, "wavg2b". */
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
#define SET_RND(regd)  __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2b"
#include "dsputil_iwmmxt_rnd_template.c"
#undef DEF
#undef SET_RND
#undef WAVG2B

/* Second pass: names of the form <x>_<y>_iwmmxt, constant 2, "wavg2br". */
#define DEF(x, y) x ## _ ## y ##_iwmmxt
#define SET_RND(regd)  __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2br"
#include "dsputil_iwmmxt_rnd_template.c"
#undef DEF
#undef SET_RND
/* The macro defined above is WAVG2B (there never was a WAVG2BR). */
#undef WAVG2B
|
40 |
|
41 |
// need scheduling
|
42 |
#define OP(AVG) \
|
43 |
__asm__ volatile ( \
|
44 |
/* alignment */ \
|
45 |
"and r12, %[pixels], #7 \n\t" \
|
46 |
"bic %[pixels], %[pixels], #7 \n\t" \
|
47 |
"tmcr wcgr1, r12 \n\t" \
|
48 |
\ |
49 |
"wldrd wr0, [%[pixels]] \n\t" \
|
50 |
"wldrd wr1, [%[pixels], #8] \n\t" \
|
51 |
"add %[pixels], %[pixels], %[line_size] \n\t" \
|
52 |
"walignr1 wr4, wr0, wr1 \n\t" \
|
53 |
\ |
54 |
"1: \n\t" \
|
55 |
\ |
56 |
"wldrd wr2, [%[pixels]] \n\t" \
|
57 |
"wldrd wr3, [%[pixels], #8] \n\t" \
|
58 |
"add %[pixels], %[pixels], %[line_size] \n\t" \
|
59 |
"pld [%[pixels]] \n\t" \
|
60 |
"walignr1 wr5, wr2, wr3 \n\t" \
|
61 |
AVG " wr6, wr4, wr5 \n\t" \
|
62 |
"wstrd wr6, [%[block]] \n\t" \
|
63 |
"add %[block], %[block], %[line_size] \n\t" \
|
64 |
\ |
65 |
"wldrd wr0, [%[pixels]] \n\t" \
|
66 |
"wldrd wr1, [%[pixels], #8] \n\t" \
|
67 |
"add %[pixels], %[pixels], %[line_size] \n\t" \
|
68 |
"walignr1 wr4, wr0, wr1 \n\t" \
|
69 |
"pld [%[pixels]] \n\t" \
|
70 |
AVG " wr6, wr4, wr5 \n\t" \
|
71 |
"wstrd wr6, [%[block]] \n\t" \
|
72 |
"add %[block], %[block], %[line_size] \n\t" \
|
73 |
\ |
74 |
"subs %[h], %[h], #2 \n\t" \
|
75 |
"bne 1b \n\t" \
|
76 |
: [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ |
77 |
: [line_size]"r"(line_size) \
|
78 |
: "memory", "r12"); |
79 |
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) |
80 |
{ |
81 |
OP("wavg2br");
|
82 |
} |
83 |
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) |
84 |
{ |
85 |
OP("wavg2b");
|
86 |
} |
87 |
#undef OP
|
88 |
|
89 |
/**
 * Add coefficients to an 8-pixel-wide, 8-row picture area in place,
 * saturating every result to the unsigned byte range.
 *
 * Each of the four loop iterations handles two picture rows: the 8 bytes of
 * each row are widened to 16-bit halfwords, added to the coefficients with
 * signed saturation, packed back to bytes with unsigned saturation and
 * stored over the original pixels.
 *
 * @param block     coefficients; 32 bytes are consumed per iteration, 128 in
 *                  total (i.e. 64 values if DCTELEM is 16 bits wide)
 * @param pixels    first of the 8 rows to update; accessed with wldrd/wstrd
 * @param line_size distance in bytes between consecutive rows
 */
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    uint8_t *row_even = pixels;              /* rows 0, 2, 4, 6 */
    uint8_t *row_odd  = pixels + line_size;  /* rows 1, 3, 5, 7 */
    const int stride2 = line_size << 1;      /* step to the next row pair */

    __asm__ volatile (
        "mov r12, #4 \n\t"                          /* 4 row pairs */
        "1: \n\t"
        /* prefetch the next row pair */
        "pld [%[even], %[stride2]] \n\t"
        "pld [%[odd], %[stride2]] \n\t"
        /* current pixels of both rows */
        "wldrd wr4, [%[even]] \n\t"
        "wldrd wr5, [%[odd]] \n\t"
        "pld [%[coef], #32] \n\t"
        /* widen bytes to halfwords, interleaved with the coefficient loads */
        "wunpckelub wr6, wr4 \n\t"
        "wldrd wr0, [%[coef]] \n\t"
        "wunpckehub wr7, wr4 \n\t"
        "wldrd wr1, [%[coef], #8] \n\t"
        "wunpckelub wr8, wr5 \n\t"
        "wldrd wr2, [%[coef], #16] \n\t"
        "wunpckehub wr9, wr5 \n\t"
        "wldrd wr3, [%[coef], #24] \n\t"
        "add %[coef], %[coef], #32 \n\t"
        /* coefficient + pixel, signed-saturating halfword add */
        "waddhss wr10, wr0, wr6 \n\t"
        "waddhss wr11, wr1, wr7 \n\t"
        "waddhss wr12, wr2, wr8 \n\t"
        "waddhss wr13, wr3, wr9 \n\t"
        /* back to bytes with unsigned saturation */
        "wpackhus wr14, wr10, wr11 \n\t"
        "wpackhus wr15, wr12, wr13 \n\t"
        /* write both rows back and advance */
        "wstrd wr14, [%[even]] \n\t"
        "add %[even], %[even], %[stride2] \n\t"
        "subs r12, r12, #1 \n\t"
        "wstrd wr15, [%[odd]] \n\t"
        "add %[odd], %[odd], %[stride2] \n\t"
        "bne 1b \n\t"
        : [coef]"+r"(block), [even]"+r"(row_even), [odd]"+r"(row_odd)
        : [stride2]"r"(stride2)
        : "cc", "memory", "r12");
}
126 |
|
127 |
/**
 * Zero a run of coefficient blocks.
 *
 * The loop runs 128 * 6 / 32 = 24 times and stores 32 zero bytes per
 * iteration, i.e. 768 bytes starting at blocks (6 blocks of 64 coefficients
 * if DCTELEM is 16 bits wide -- confirm against the DCTELEM definition).
 *
 * @param blocks start of the area to clear; written with wstrd
 */
static void clear_blocks_iwmmxt(DCTELEM *blocks)
{
    __asm__ volatile(
        "wzero wr0 \n\t"
        "mov r1, #(128 * 6 / 32) \n\t"   /* iteration count */
        "1: \n\t"
        "wstrd wr0, [%0] \n\t"
        "wstrd wr0, [%0, #8] \n\t"
        "wstrd wr0, [%0, #16] \n\t"
        "wstrd wr0, [%0, #24] \n\t"
        "subs r1, r1, #1 \n\t"
        "add %0, %0, #32 \n\t"
        "bne 1b \n\t"
        : "+r"(blocks)
        :
        /* subs changes the flags and wstrd writes through the pointer, so
         * the compiler must be told about both, not just about r1 */
        : "cc", "memory", "r1"
    );
}
145 |
|
146 |
/* Do-nothing function with the same parameter list as the pixel
 * functions in this file; none of its arguments is read or written. */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    (void)block;
    (void)pixels;
    (void)line_size;
    (void)h;
}
150 |
|
151 |
/* A run-time test is not simple. If this file is compiled in,
 * then we should install the functions.
 */
|
154 |
|
155 |
/**
 * Install the iWMMXt implementations into a DSPContext.
 *
 * iWMMXt support is assumed to be present; avctx->dsp_mask can force flags
 * on (when AV_CPU_FLAG_FORCE is set in it) or mask flags off (otherwise).
 * If AV_CPU_FLAG_IWMMXT ends up cleared, c is left untouched.
 *
 * add_pixels_clamped is always installed.  clear_blocks and the put/avg
 * pixel tables are skipped for H.264 with more than 8 bits per raw sample.
 *
 * @param c     DSP context whose function pointers are filled in
 * @param avctx codec context; codec_id, bits_per_raw_sample and dsp_mask
 *              are read
 */
void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
{
    int cpu_flags = AV_CPU_FLAG_IWMMXT; /* multimedia extension flags */
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 &&
                               avctx->bits_per_raw_sample > 8;

    if (avctx->dsp_mask) {
        const int user_bits = avctx->dsp_mask & 0xffff;

        if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) {
            cpu_flags |= user_bits;
        } else {
            cpu_flags &= ~user_bits;
        }
    }

    if (!(cpu_flags & AV_CPU_FLAG_IWMMXT)) {
        return;
    }

    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;

    /* Everything below is skipped for high bit depth H.264. */
    if (high_bit_depth) {
        return;
    }

    c->clear_blocks = clear_blocks_iwmmxt;

    /* 16-pixel-wide put */
    c->put_pixels_tab[0][0]        = put_pixels16_iwmmxt;
    c->put_pixels_tab[0][1]        = put_pixels16_x2_iwmmxt;
    c->put_pixels_tab[0][2]        = put_pixels16_y2_iwmmxt;
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_iwmmxt;
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;

    /* 8-pixel-wide put */
    c->put_pixels_tab[1][0]        = put_pixels8_iwmmxt;
    c->put_pixels_tab[1][1]        = put_pixels8_x2_iwmmxt;
    c->put_pixels_tab[1][2]        = put_pixels8_y2_iwmmxt;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_iwmmxt;
    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;

    /* 16-pixel-wide avg */
    c->avg_pixels_tab[0][0]        = avg_pixels16_iwmmxt;
    c->avg_pixels_tab[0][1]        = avg_pixels16_x2_iwmmxt;
    c->avg_pixels_tab[0][2]        = avg_pixels16_y2_iwmmxt;
    c->avg_pixels_tab[0][3]        = avg_pixels16_xy2_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;

    /* 8-pixel-wide avg */
    c->avg_pixels_tab[1][0]        = avg_pixels8_iwmmxt;
    c->avg_pixels_tab[1][1]        = avg_pixels8_x2_iwmmxt;
    c->avg_pixels_tab[1][2]        = avg_pixels8_y2_iwmmxt;
    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_iwmmxt;
    /* NOTE(review): the 16-wide table above uses avg_pixels16_iwmmxt for
     * entry [0][0], while this one uses the _no_rnd_ variant -- confirm
     * which is intended. */
    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
}