ffmpeg / libavcodec / arm / dsputil_vfp.S @ a2fc0f6a
History | View | Annotate | Download (7.01 KB)
1 | 83ad74e7 | Måns Rullgård | /* |
---|---|---|---|
2 | * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> |
||
3 | * |
||
4 | * This file is part of FFmpeg. |
||
5 | * |
||
6 | * FFmpeg is free software; you can redistribute it and/or |
||
7 | * modify it under the terms of the GNU Lesser General Public |
||
8 | * License as published by the Free Software Foundation; either |
||
9 | * version 2.1 of the License, or (at your option) any later version. |
||
10 | * |
||
11 | * FFmpeg is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | * Lesser General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU Lesser General Public |
||
17 | * License along with FFmpeg; if not, write to the Free Software |
||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
19 | */ |
||
20 | |||
21 | #include "config.h" |
||
22 | #include "asm.S" |
||
23 | |||
24 | b0e8ce55 | Måns Rullgård | .fpu neon @ required for gas to accept UAL syntax |
25 | 83ad74e7 | Måns Rullgård | /* |
26 | * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle |
||
27 | * throughput for almost all the instructions (except for double precision |
||
28 | * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles |
||
29 | * for arithmetic operations. Scheduling code to avoid pipeline stalls is very |
||
30 | * important for performance. One more interesting feature is that VFP has |
||
31 | * independent load/store and arithmetics pipelines, so it is possible to make |
||
32 | * them work simultaneously and get more than 1 operation per cycle. Load/store |
||
33 | * pipeline can process 2 single precision floating point values per cycle and |
||
34 | * supports bulk loads and stores for large sets of registers. Arithmetic operations |
||
35 | * can be done on vectors, which makes it possible to keep the arithmetic pipeline busy, |
||
36 | * while the processor may issue and execute other instructions. Detailed |
||
37 | * optimization manuals can be found at http://www.arm.com |
||
38 | */ |
||
39 | |||
40 | /** |
||
41 | * ARM VFP optimized implementation of 'vector_fmul_c' function. |
||
42 | * Assume that len is a positive number and is a multiple of 8. |
||
| * Multiplies in place: dst[i] = dst[i] * src[i] for 0 <= i < len. |
| * Register use: r0 = dst write pointer, r3 = dst read pointer, r1 = src, |
| * r2 = element counter, r12 = FPSCR value. |
| * Data is handled in batches of 8 floats, up to two batches per loop pass; |
| * loads, multiplies and stores of successive batches are interleaved. |
||
43 | */ |
||
44 | @ void ff_vector_fmul_vfp(float *dst, const float *src, int len) |
||
45 | function ff_vector_fmul_vfp, export=1 |
||
46 | vpush {d8-d15} /* save d8-d15 (s16-s31), which the loop below overwrites */ |
||
47 | mov r3, r0 /* r3 = read pointer into dst; r0 stays the write pointer */ |
||
48 | fmrx r12, fpscr /* r12 = current FPSCR */ |
||
49 | orr r12, r12, #(3 << 16) /* set vector size to 4 */ |
||
50 | fmxr fpscr, r12 /* write the updated value back to FPSCR */ |
||
51 | |||
52 | b0e8ce55 | Måns Rullgård | vldmia r3!, {s0-s3} /* s0-s3 = dst[0..3] */
53 | vldmia r1!, {s8-s11} /* s8-s11 = src[0..3] */ |
||
54 | vldmia r3!, {s4-s7} /* s4-s7 = dst[4..7] */ |
||
55 | vldmia r1!, {s12-s15} /* s12-s15 = src[4..7] */ |
||
56 | vmul.f32 s8, s0, s8 /* first product: s8-s11 = s0-s3 * s8-s11 (vector size is 4) */ |
||
57 | 83ad74e7 | Måns Rullgård | 1: /* each pass finishes one batch of 8 and, if present, a second one */
58 | subs r2, r2, #16 /* r2 -= 16; the ge/gt conditions below test this result */ |
||
59 | b0e8ce55 | Måns Rullgård | vmul.f32 s12, s4, s12 /* s12-s15 = s4-s7 * s12-s15 */
60 | vldmiage r3!, {s16-s19} /* ge: a second batch of 8 exists in this pass; load its dst values... */ |
||
61 | vldmiage r1!, {s24-s27} /* ...and its src values */ |
||
62 | vldmiage r3!, {s20-s23} |
||
63 | vldmiage r1!, {s28-s31} |
||
64 | vmulge.f32 s24, s16, s24 /* s24-s27 = s16-s19 * s24-s27 */ |
||
65 | vstmia r0!, {s8-s11} /* store products 0..3 of the first batch */ |
||
66 | vstmia r0!, {s12-s15} /* store products 4..7 of the first batch */ |
||
67 | vmulge.f32 s28, s20, s28 /* s28-s31 = s20-s23 * s28-s31 */ |
||
68 | vldmiagt r3!, {s0-s3} /* gt: data remains after this pass; preload the next pass's first batch */ |
||
69 | vldmiagt r1!, {s8-s11} |
||
70 | vldmiagt r3!, {s4-s7} |
||
71 | vldmiagt r1!, {s12-s15} |
||
72 | vmulge.f32 s8, s0, s8 /* start the next pass's first product; when r2 == 0 this runs on stale registers and the result is never stored */ |
||
73 | vstmiage r0!, {s24-s27} /* store products 0..3 of the second batch */ |
||
74 | vstmiage r0!, {s28-s31} /* store products 4..7 of the second batch */ |
||
75 | 83ad74e7 | Måns Rullgård | bgt 1b /* loop while the counter in r2 is still positive */
76 | |||
77 | bic r12, r12, #(7 << 16) /* set vector size back to 1 */ |
||
78 | fmxr fpscr, r12 /* write FPSCR with the vector size field cleared */ |
||
79 | vpop {d8-d15} /* restore the registers saved on entry */ |
||
80 | bx lr |
||
81 | .endfunc |
||
82 | |||
83 | /** |
||
84 | * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. |
||
85 | * Assume that len is a positive number and is a multiple of 8. |
||
| * Computes dst[i] = src0[i] * src1[len - 1 - i] for 0 <= i < len: src0 is |
| * read forwards through r1 and src1 is read backwards through r2. |
| * Register use: r0 = dst write pointer, r1 = src0, r2 = src1 (moving down |
| * from its end), r3 = element counter. |
| * Only single-register multiplies are issued; FPSCR is not written here. |
||
86 | */ |
||
87 | @ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, |
||
88 | @ const float *src1, int len) |
||
89 | function ff_vector_fmul_reverse_vfp, export=1 |
||
90 | vpush {d8-d15} /* save d8-d15 (s16-s31), which the loop below overwrites */ |
||
91 | add r2, r2, r3, lsl #2 /* r2 = src1 + len floats, i.e. one past the last element */ |
||
92 | b0e8ce55 | Måns Rullgård | vldmdb r2!, {s0-s3} /* s0-s3 = last 4 floats of src1 (s3 holds src1[len-1]) */
93 | vldmia r1!, {s8-s11} /* s8-s11 = src0[0..3] */ |
||
94 | vldmdb r2!, {s4-s7} /* s4-s7 = the 4 floats of src1 before those (s7 holds src1[len-5]) */ |
||
95 | vldmia r1!, {s12-s15} /* s12-s15 = src0[4..7] */ |
||
96 | vmul.f32 s8, s3, s8 /* product 0 = src1[len-1] * src0[0]; src1 registers are paired in reverse order */ |
||
97 | vmul.f32 s9, s2, s9 |
||
98 | vmul.f32 s10, s1, s10 |
||
99 | vmul.f32 s11, s0, s11 |
||
100 | 83ad74e7 | Måns Rullgård | 1: /* each pass finishes one batch of 8 and, if present, a second one */
101 | subs r3, r3, #16 /* r3 -= 16; the ge/gt conditions below test this result */ |
||
102 | b0e8ce55 | Måns Rullgård | vldmdbge r2!, {s16-s19} /* ge: a second batch of 8 exists; its loads are interleaved with the first batch's multiplies */
103 | vmul.f32 s12, s7, s12 /* products 4..7 of the first batch */ |
||
104 | vldmiage r1!, {s24-s27} |
||
105 | vmul.f32 s13, s6, s13 |
||
106 | vldmdbge r2!, {s20-s23} |
||
107 | vmul.f32 s14, s5, s14 |
||
108 | vldmiage r1!, {s28-s31} |
||
109 | vmul.f32 s15, s4, s15 |
||
110 | vmulge.f32 s24, s19, s24 /* products of the second batch, again pairing the src1 registers in reverse */ |
||
111 | vldmdbgt r2!, {s0-s3} /* gt: data remains after this pass; preload src1 for the next pass */ |
||
112 | vmulge.f32 s25, s18, s25 |
||
113 | vstmia r0!, {s8-s13} /* store the first 6 products of the first batch */ |
||
114 | vmulge.f32 s26, s17, s26 |
||
115 | vldmiagt r1!, {s8-s11} /* s8-s11 were stored above, so they can take src0 data for the next pass */ |
||
116 | vmulge.f32 s27, s16, s27 |
||
117 | vmulge.f32 s28, s23, s28 |
||
118 | vldmdbgt r2!, {s4-s7} |
||
119 | vmulge.f32 s29, s22, s29 |
||
120 | vstmia r0!, {s14-s15} /* store the remaining 2 products of the first batch */ |
||
121 | vmulge.f32 s30, s21, s30 |
||
122 | vmulge.f32 s31, s20, s31 |
||
123 | vmulge.f32 s8, s3, s8 /* start the next pass's products; when r3 == 0 these run on stale registers and are never stored */ |
||
124 | vldmiagt r1!, {s12-s15} /* s12-s15 have all been stored by now; reload them with src0 data */ |
||
125 | vmulge.f32 s9, s2, s9 |
||
126 | vmulge.f32 s10, s1, s10 |
||
127 | vstmiage r0!, {s24-s27} /* store products 0..3 of the second batch */ |
||
128 | vmulge.f32 s11, s0, s11 |
||
129 | vstmiage r0!, {s28-s31} /* store products 4..7 of the second batch */ |
||
130 | 83ad74e7 | Måns Rullgård | bgt 1b /* loop while the counter in r3 is still positive */
131 | |||
132 | vpop {d8-d15} /* restore the registers saved on entry */ |
||
133 | bx lr |
||
134 | .endfunc |
||
135 | |||
136 | #ifdef HAVE_ARMV6 |
||
137 | /** |
||
138 | * ARM VFP optimized float to int16 conversion. |
||
139 | * Assume that len is a positive number and is a multiple of 8, destination |
||
||
140 | * buffer is at least 4 bytes aligned (8 bytes alignment is better for |
||
141 | * performance), little-endian byte order. |
||
| * Each loop pass takes 8 floats already converted to 32-bit integers, |
| * saturates them to 16 bits, packs them in pairs and stores four 32-bit |
| * words, while the next 8 floats are loaded and converted. |
| * Register use: r0 = dst, r1 = src, r2 = element counter; r3-r8, ip and lr |
| * hold the integer samples. |
| * NOTE(review): vcvt.s32.f32 without the R suffix is documented as rounding |
| * toward zero; confirm that truncation is the intended rounding here. |
||
142 | */ |
||
143 | @ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) |
||
144 | function ff_float_to_int16_vfp, export=1 |
||
145 | push {r4-r8,lr} /* save r4-r8 and lr; lr is used as a scratch register below */ |
||
146 | vpush {d8-d11} /* save d8-d11 (s16-s23), which receive the loaded floats */ |
||
147 | b0e8ce55 | Måns Rullgård | vldmia r1!, {s16-s23} /* load the first 8 floats */
148 | vcvt.s32.f32 s0, s16 /* s0-s7 = the 8 inputs converted to 32-bit integers */ |
||
149 | vcvt.s32.f32 s1, s17 |
||
150 | vcvt.s32.f32 s2, s18 |
||
151 | vcvt.s32.f32 s3, s19 |
||
152 | vcvt.s32.f32 s4, s20 |
||
153 | vcvt.s32.f32 s5, s21 |
||
154 | vcvt.s32.f32 s6, s22 |
||
155 | vcvt.s32.f32 s7, s23 |
||
156 | 83ad74e7 | Måns Rullgård | 1: /* one pass per 8 samples */
157 | subs r2, r2, #8 /* r2 -= 8; gt below means more input remains */ |
||
158 | b0e8ce55 | Måns Rullgård | vmov r3, r4, s0, s1 /* move the 8 converted values to core registers, two per vmov */
159 | vmov r5, r6, s2, s3 |
||
160 | vmov r7, r8, s4, s5 |
||
161 | vmov ip, lr, s6, s7 |
||
162 | vldmiagt r1!, {s16-s23} /* gt: load the next 8 floats while this batch is saturated and packed */ |
||
163 | 83ad74e7 | Måns Rullgård | ssat r4, #16, r4 /* clamp samples 0..3 to the signed 16-bit range */
164 | ssat r3, #16, r3 |
||
165 | ssat r6, #16, r6 |
||
166 | ssat r5, #16, r5 |
||
167 | pkhbt r3, r3, r4, lsl #16 /* r3 = sample 0 in the low halfword, sample 1 in the high halfword */ |
||
168 | pkhbt r4, r5, r6, lsl #16 /* r4 = samples 2 (low) and 3 (high) */ |
||
169 | b0e8ce55 | Måns Rullgård | vcvtgt.s32.f32 s0, s16 /* gt: convert the next batch; s0-s7 were already copied out above */
170 | vcvtgt.s32.f32 s1, s17 |
||
171 | vcvtgt.s32.f32 s2, s18 |
||
172 | vcvtgt.s32.f32 s3, s19 |
||
173 | vcvtgt.s32.f32 s4, s20 |
||
174 | vcvtgt.s32.f32 s5, s21 |
||
175 | vcvtgt.s32.f32 s6, s22 |
||
176 | vcvtgt.s32.f32 s7, s23 |
||
177 | 83ad74e7 | Måns Rullgård | ssat r8, #16, r8 /* clamp samples 4..7 to the signed 16-bit range */
178 | ssat r7, #16, r7 |
||
179 | ssat lr, #16, lr |
||
180 | ssat ip, #16, ip |
||
181 | pkhbt r5, r7, r8, lsl #16 /* r5 = samples 4 (low) and 5 (high) */ |
||
182 | pkhbt r6, ip, lr, lsl #16 /* r6 = samples 6 (low) and 7 (high) */ |
||
183 | stmia r0!, {r3-r6} /* store 8 int16 samples (16 bytes) and advance dst */ |
||
184 | bgt 1b /* loop while the counter in r2 is still positive */ |
||
185 | |||
186 | vpop {d8-d11} /* restore the VFP registers saved on entry */ |
||
187 | pop {r4-r8,pc} /* restore r4-r8 and return by loading the saved lr into pc */ |
||
188 | .endfunc |
||
189 | #endif |