; libavcodec/x86/vp8dsp.asm — revision f2a30bd8
146 | 146 |
pw_17734: times 4 dw 17734 |
147 | 147 |
|
148 | 148 |
cextern pw_3 |
149 |
cextern pb_3 |
|
149 | 150 |
cextern pw_4 |
151 |
cextern pb_4 |
|
150 | 152 |
cextern pw_64 |
153 |
cextern pb_80 |
|
154 |
cextern pb_F8 |
|
155 |
cextern pb_FE |
|
151 | 156 |
|
152 | 157 |
SECTION .text |
153 | 158 |
|
... | ... | |
1063 | 1068 |
add r0, 2*16*4 |
1064 | 1069 |
SCATTER_WHT 3 |
1065 | 1070 |
RET |
1071 |
|
|
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
|
1076 |
; macro called with 7 mm register indexes as argument, and 4 regular registers |
|
1077 |
; |
|
1078 |
; first 4 mm registers will carry the transposed pixel data |
|
1079 |
; the other three are scratchspace (one would be sufficient, but this allows |
|
1080 |
; for more spreading/pipelining and thus faster execution on OOE CPUs) |
|
1081 |
; |
|
1082 |
; first two regular registers are buf+4*stride and buf+5*stride |
|
1083 |
; third is -stride, fourth is +stride |
|
1084 |
%macro READ_8x4_INTERLEAVED 11 |
|
1085 |
; interleave 8 (A-H) rows of 4 pixels each |
|
1086 |
movd m%1, [%8+%10*4] ; A0-3 |
|
1087 |
movd m%5, [%9+%10*4] ; B0-3 |
|
1088 |
movd m%2, [%8+%10*2] ; C0-3 |
|
1089 |
movd m%6, [%8+%10] ; D0-3 |
|
1090 |
movd m%3, [%8] ; E0-3 |
|
1091 |
movd m%7, [%9] ; F0-3 |
|
1092 |
movd m%4, [%9+%11] ; G0-3 |
|
1093 |
punpcklbw m%1, m%5 ; A/B interleaved |
|
1094 |
movd m%5, [%9+%11*2] ; H0-3 |
|
1095 |
punpcklbw m%2, m%6 ; C/D interleaved |
|
1096 |
punpcklbw m%3, m%7 ; E/F interleaved |
|
1097 |
punpcklbw m%4, m%5 ; G/H interleaved |
|
1098 |
%endmacro |
|
1099 |
|
|
1100 |
; macro called with 7 mm register indexes as argument, and 5 regular registers |
|
1101 |
; first 11 mean the same as READ_8x4_TRANSPOSED above |
|
1102 |
; fifth regular register is scratchspace to reach the bottom 8 rows, it |
|
1103 |
; will be set to second regular register + 8*stride at the end |
|
1104 |
%macro READ_16x4_INTERLEAVED 12 |
|
1105 |
; transpose 16 (A-P) rows of 4 pixels each |
|
1106 |
lea %12, [r0+8*r2] |
|
1107 |
|
|
1108 |
; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M |
|
1109 |
movd m%1, [%8+%10*4] ; A0-3 |
|
1110 |
movd m%3, [%12+%10*4] ; I0-3 |
|
1111 |
movd m%2, [%8+%10*2] ; C0-3 |
|
1112 |
movd m%4, [%12+%10*2] ; K0-3 |
|
1113 |
movd m%6, [%8+%10] ; D0-3 |
|
1114 |
movd m%5, [%12+%10] ; L0-3 |
|
1115 |
movd m%7, [%12] ; M0-3 |
|
1116 |
add %12, %11 |
|
1117 |
punpcklbw m%1, m%3 ; A/I |
|
1118 |
movd m%3, [%8] ; E0-3 |
|
1119 |
punpcklbw m%2, m%4 ; C/K |
|
1120 |
punpcklbw m%6, m%5 ; D/L |
|
1121 |
punpcklbw m%3, m%7 ; E/M |
|
1122 |
punpcklbw m%2, m%6 ; C/D/K/L interleaved |
|
1123 |
|
|
1124 |
; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P |
|
1125 |
movd m%5, [%9+%10*4] ; B0-3 |
|
1126 |
movd m%4, [%12+%10*4] ; J0-3 |
|
1127 |
movd m%7, [%9] ; F0-3 |
|
1128 |
movd m%6, [%12] ; N0-3 |
|
1129 |
punpcklbw m%5, m%4 ; B/J |
|
1130 |
punpcklbw m%7, m%6 ; F/N |
|
1131 |
punpcklbw m%1, m%5 ; A/B/I/J interleaved |
|
1132 |
punpcklbw m%3, m%7 ; E/F/M/N interleaved |
|
1133 |
movd m%4, [%9+%11] ; G0-3 |
|
1134 |
movd m%6, [%12+%11] ; O0-3 |
|
1135 |
movd m%5, [%9+%11*2] ; H0-3 |
|
1136 |
movd m%7, [%12+%11*2] ; P0-3 |
|
1137 |
punpcklbw m%4, m%6 ; G/O |
|
1138 |
punpcklbw m%5, m%7 ; H/P |
|
1139 |
punpcklbw m%4, m%5 ; G/H/O/P interleaved |
|
1140 |
%endmacro |
|
1141 |
|
|
1142 |
; write 4 mm registers of 2 dwords each |
|
1143 |
; first four arguments are mm register indexes containing source data |
|
1144 |
; last four are registers containing buf+4*stride, buf+5*stride, |
|
1145 |
; -stride and +stride |
|
1146 |
%macro WRITE_4x2D 8 |
|
1147 |
; write out (2 dwords per register) |
|
1148 |
movd [%5+%7*4], m%1 |
|
1149 |
movd [%5+%7*2], m%2 |
|
1150 |
movd [%5], m%3 |
|
1151 |
movd [%6+%8], m%4 |
|
1152 |
punpckhdq m%1, m%1 |
|
1153 |
punpckhdq m%2, m%2 |
|
1154 |
punpckhdq m%3, m%3 |
|
1155 |
punpckhdq m%4, m%4 |
|
1156 |
movd [%6+%7*4], m%1 |
|
1157 |
movd [%5+%7], m%2 |
|
1158 |
movd [%6], m%3 |
|
1159 |
movd [%6+%8*2], m%4 |
|
1160 |
%endmacro |
|
1161 |
|
|
1162 |
; write 4 xmm registers of 4 dwords each |
|
1163 |
; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular |
|
1164 |
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |
|
1165 |
; we add 1*stride to the third regular registry in the process |
|
1166 |
%macro WRITE_4x4D 9 |
|
1167 |
; write out (4 dwords per register), start with dwords zero |
|
1168 |
movd [%5+%8*4], m%1 |
|
1169 |
movd [%5], m%2 |
|
1170 |
movd [%5+%9*4], m%3 |
|
1171 |
movd [%5+%9*8], m%4 |
|
1172 |
|
|
1173 |
; store dwords 1 |
|
1174 |
psrldq m%1, 4 |
|
1175 |
psrldq m%2, 4 |
|
1176 |
psrldq m%3, 4 |
|
1177 |
psrldq m%4, 4 |
|
1178 |
movd [%6+%8*4], m%1 |
|
1179 |
movd [%6], m%2 |
|
1180 |
movd [%6+%9*4], m%3 |
|
1181 |
movd [%6+%9*8], m%4 |
|
1182 |
|
|
1183 |
; write dwords 2 |
|
1184 |
psrldq m%1, 4 |
|
1185 |
psrldq m%2, 4 |
|
1186 |
psrldq m%3, 4 |
|
1187 |
psrldq m%4, 4 |
|
1188 |
movd [%5+%8*2], m%1 |
|
1189 |
movd [%6+%9], m%2 |
|
1190 |
movd [%7+%8*2], m%3 |
|
1191 |
movd [%7+%9*2], m%4 |
|
1192 |
add %7, %9 |
|
1193 |
|
|
1194 |
; store dwords 3 |
|
1195 |
psrldq m%1, 4 |
|
1196 |
psrldq m%2, 4 |
|
1197 |
psrldq m%3, 4 |
|
1198 |
psrldq m%4, 4 |
|
1199 |
movd [%5+%8], m%1 |
|
1200 |
movd [%6+%9*2], m%2 |
|
1201 |
movd [%7+%8*2], m%3 |
|
1202 |
movd [%7+%9*2], m%4 |
|
1203 |
%endmacro |
|
1204 |
|
|
1205 |
%macro SIMPLE_LOOPFILTER 3 |
|
1206 |
cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |
|
1207 |
%ifidn %2, h |
|
1208 |
mov r5, rsp ; backup stack pointer |
|
1209 |
and rsp, ~(mmsize-1) ; align stack |
|
1210 |
%endif |
|
1211 |
%if mmsize == 8 ; mmx/mmxext |
|
1212 |
mov r3, 2 |
|
1213 |
%endif |
|
1214 |
|
|
1215 |
; splat register with "flim" |
|
1216 |
movd m7, r2 |
|
1217 |
punpcklbw m7, m7 |
|
1218 |
%if mmsize == 16 ; sse2 |
|
1219 |
punpcklwd m7, m7 |
|
1220 |
pshufd m7, m7, 0x0 |
|
1221 |
%elifidn %1, mmx |
|
1222 |
punpcklwd m7, m7 |
|
1223 |
punpckldq m7, m7 |
|
1224 |
%else ; mmxext |
|
1225 |
pshufw m7, m7, 0x0 |
|
1226 |
%endif |
|
1227 |
|
|
1228 |
; set up indexes to address 4 rows |
|
1229 |
mov r2, r1 |
|
1230 |
neg r1 |
|
1231 |
%ifidn %2, h |
|
1232 |
lea r0, [r0+4*r2-2] |
|
1233 |
sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 |
|
1234 |
%endif |
|
1235 |
|
|
1236 |
%if mmsize == 8 ; mmx / mmxext |
|
1237 |
.next8px |
|
1238 |
%endif |
|
1239 |
%ifidn %2, v |
|
1240 |
; read 4 half/full rows of pixels |
|
1241 |
mova m0, [r0+r1*2] ; p1 |
|
1242 |
mova m1, [r0+r1] ; p0 |
|
1243 |
mova m2, [r0] ; q0 |
|
1244 |
mova m3, [r0+r2] ; q1 |
|
1245 |
%else ; h |
|
1246 |
lea r4, [r0+r2] |
|
1247 |
|
|
1248 |
%if mmsize == 8 ; mmx/mmxext |
|
1249 |
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |
|
1250 |
%else ; sse2 |
|
1251 |
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |
|
1252 |
%endif |
|
1253 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
|
1254 |
|
|
1255 |
mova [rsp], m0 ; store p1 |
|
1256 |
mova [rsp+mmsize], m3 ; store q1 |
|
1257 |
%endif |
|
1258 |
|
|
1259 |
; simple_limit |
|
1260 |
mova m5, m2 ; m5=backup of q0 |
|
1261 |
mova m6, m1 ; m6=backup of p0 |
|
1262 |
psubusb m1, m2 ; p0-q0 |
|
1263 |
psubusb m2, m6 ; q0-p0 |
|
1264 |
por m1, m2 ; FFABS(p0-q0) |
|
1265 |
paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |
|
1266 |
|
|
1267 |
mova m4, m3 |
|
1268 |
mova m2, m0 |
|
1269 |
psubusb m3, m0 ; q1-p1 |
|
1270 |
psubusb m0, m4 ; p1-q1 |
|
1271 |
por m3, m0 ; FFABS(p1-q1) |
|
1272 |
mova m0, [pb_80] |
|
1273 |
pxor m2, m0 |
|
1274 |
pxor m4, m0 |
|
1275 |
psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |
|
1276 |
pand m3, [pb_FE] |
|
1277 |
psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |
|
1278 |
paddusb m3, m1 |
|
1279 |
psubusb m3, m7 |
|
1280 |
pxor m1, m1 |
|
1281 |
pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |
|
1282 |
|
|
1283 |
; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |
|
1284 |
mova m4, m5 |
|
1285 |
pxor m5, m0 |
|
1286 |
pxor m0, m6 |
|
1287 |
psubsb m5, m0 ; q0-p0 (signed) |
|
1288 |
paddsb m2, m5 |
|
1289 |
paddsb m2, m5 |
|
1290 |
paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |
|
1291 |
pand m2, m3 ; apply filter mask (m3) |
|
1292 |
|
|
1293 |
mova m3, [pb_F8] |
|
1294 |
mova m1, m2 |
|
1295 |
paddsb m2, [pb_4] ; f1<<3=a+4 |
|
1296 |
paddsb m1, [pb_3] ; f2<<3=a+3 |
|
1297 |
pand m2, m3 |
|
1298 |
pand m1, m3 ; cache f2<<3 |
|
1299 |
|
|
1300 |
pxor m0, m0 |
|
1301 |
pxor m3, m3 |
|
1302 |
pcmpgtb m0, m2 ; which values are <0? |
|
1303 |
psubb m3, m2 ; -f1<<3 |
|
1304 |
psrlq m2, 3 ; +f1 |
|
1305 |
psrlq m3, 3 ; -f1 |
|
1306 |
pand m3, m0 |
|
1307 |
pandn m0, m2 |
|
1308 |
psubusb m4, m0 |
|
1309 |
paddusb m4, m3 ; q0-f1 |
|
1310 |
|
|
1311 |
pxor m0, m0 |
|
1312 |
pxor m3, m3 |
|
1313 |
pcmpgtb m0, m1 ; which values are <0? |
|
1314 |
psubb m3, m1 ; -f2<<3 |
|
1315 |
psrlq m1, 3 ; +f2 |
|
1316 |
psrlq m3, 3 ; -f2 |
|
1317 |
pand m3, m0 |
|
1318 |
pandn m0, m1 |
|
1319 |
paddusb m6, m0 |
|
1320 |
psubusb m6, m3 ; p0+f2 |
|
1321 |
|
|
1322 |
; store |
|
1323 |
%ifidn %2, v |
|
1324 |
mova [r0], m4 |
|
1325 |
mova [r0+r1], m6 |
|
1326 |
%else ; h |
|
1327 |
mova m0, [rsp] ; p1 |
|
1328 |
SWAP 2, 4 ; p0 |
|
1329 |
SWAP 1, 6 ; q0 |
|
1330 |
mova m3, [rsp+mmsize] ; q1 |
|
1331 |
|
|
1332 |
TRANSPOSE4x4B 0, 1, 2, 3, 4 |
|
1333 |
%if mmsize == 16 ; sse2 |
|
1334 |
add r3, r1 ; change from r4*8*stride to r0+8*stride |
|
1335 |
WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2 |
|
1336 |
%else ; mmx/mmxext |
|
1337 |
WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 |
|
1338 |
%endif |
|
1339 |
%endif |
|
1340 |
|
|
1341 |
%if mmsize == 8 ; mmx/mmxext |
|
1342 |
; next 8 pixels |
|
1343 |
%ifidn %2, v |
|
1344 |
add r0, 8 ; advance 8 cols = pixels |
|
1345 |
%else ; h |
|
1346 |
lea r0, [r0+r2*8] ; advance 8 rows = lines |
|
1347 |
%endif |
|
1348 |
dec r3 |
|
1349 |
jg .next8px |
|
1350 |
%ifidn %2, v |
|
1351 |
REP_RET |
|
1352 |
%else ; h |
|
1353 |
mov rsp, r5 ; restore stack pointer |
|
1354 |
RET |
|
1355 |
%endif |
|
1356 |
%else ; sse2 |
|
1357 |
%ifidn %2, h |
|
1358 |
mov rsp, r5 ; restore stack pointer |
|
1359 |
%endif |
|
1360 |
RET |
|
1361 |
%endif |
|
1362 |
%endmacro |
|
1363 |
|
|
1364 |
INIT_MMX |
|
1365 |
SIMPLE_LOOPFILTER mmx, v, 4 |
|
1366 |
SIMPLE_LOOPFILTER mmx, h, 6 |
|
1367 |
SIMPLE_LOOPFILTER mmxext, v, 4 |
|
1368 |
SIMPLE_LOOPFILTER mmxext, h, 6 |
|
1369 |
INIT_XMM |
|
1370 |
SIMPLE_LOOPFILTER sse2, v, 3 |
|
1371 |
SIMPLE_LOOPFILTER sse2, h, 6 |
; Also available in: Unified diff