Revision f2a30bd8

View differences:

libavcodec/x86/dsputil_mmx.c
@@ -63,12 +63,16 @@
 
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3  ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4  ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
 
 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
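
Note: the four new 16-byte (xmm_reg) constants are splat patterns consumed by the simple loop filter added below: pb_3/pb_4 are the rounding terms of filter_common, pb_80 re-biases unsigned pixels into signed range, and pb_F8/pb_FE make qword shifts safe per byte. Scalar models of those idioms (an illustrative sketch, not code from this revision):

    #include <stdint.h>

    /* pb_80: XOR with 0x80 re-biases an unsigned byte into signed range so
     * that signed saturating ops (psubsb/paddsb) can be applied to it. */
    static int8_t rebias(uint8_t x) { return (int8_t)(x ^ 0x80); }

    /* pb_FE, pb_F8: MMX/SSE2 have no per-byte shift; psrlq shifts a whole
     * 64-bit lane, so each byte's low bit(s) are masked off first to keep
     * them from being shifted into the neighbouring byte. */
    static uint64_t bytewise_shr1(uint64_t v) { return (v & 0xFEFEFEFEFEFEFEFEULL) >> 1; }
    static uint64_t bytewise_shr3(uint64_t v) { return (v & 0xF8F8F8F8F8F8F8F8ULL) >> 3; }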
libavcodec/x86/vp8dsp-init.c
@@ -222,6 +222,13 @@
 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
+
+extern void ff_vp8_v_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim);
+extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
+extern void ff_vp8_v_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim);
+extern void ff_vp8_h_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim);
+extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
+extern void ff_vp8_h_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim);
 #endif
 
 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@@ -260,6 +267,9 @@
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
         c->put_vp8_epel_pixels_tab[1][0][0]     =
         c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
+
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
     }
 
     /* note that 4-tap width=16 functions are missing because w=16
@@ -272,6 +282,9 @@
         VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
         VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
         VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
+
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
     }
 
     if (mm_flags & FF_MM_SSE) {
@@ -284,6 +297,9 @@
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
         VP8_BILINEAR_MC_FUNC(1, 8, sse2);
+
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
     }
 
     if (mm_flags & FF_MM_SSSE3) {
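
Note: the init code follows dsputil's usual pattern: each CPU-feature tier re-assigns the same context pointers, so the strongest tier the CPU supports wins. A minimal self-contained sketch of that dispatch (the struct, flag values, and stub functions are illustrative stand-ins, not FFmpeg's actual FF_MM_*/VP8DSPContext definitions):

    #include <stdint.h>

    /* context reduced to the two pointers this commit adds (illustrative) */
    typedef struct {
        void (*vp8_v_loop_filter_simple)(uint8_t *dst, int stride, int flim);
        void (*vp8_h_loop_filter_simple)(uint8_t *dst, int stride, int flim);
    } SimpleLFContext;

    /* stand-in feature flags; the real FF_MM_* values live in avcodec.h */
    enum { HAS_MMX = 1 << 0, HAS_MMXEXT = 1 << 1, HAS_SSE2 = 1 << 2 };

    /* stubs standing in for the asm implementations declared above */
    static void lf_mmx   (uint8_t *d, int s, int f) { (void)d; (void)s; (void)f; }
    static void lf_mmxext(uint8_t *d, int s, int f) { (void)d; (void)s; (void)f; }
    static void lf_sse2  (uint8_t *d, int s, int f) { (void)d; (void)s; (void)f; }

    static void init_simple_lf(SimpleLFContext *c, int mm_flags)
    {
        /* each tier overwrites the previous one, so after init the
         * pointers hold the best version the CPU supports */
        if (mm_flags & HAS_MMX) {
            c->vp8_v_loop_filter_simple = lf_mmx;
            c->vp8_h_loop_filter_simple = lf_mmx;
        }
        if (mm_flags & HAS_MMXEXT) {
            c->vp8_v_loop_filter_simple = lf_mmxext;
            c->vp8_h_loop_filter_simple = lf_mmxext;
        }
        if (mm_flags & HAS_SSE2) {
            c->vp8_v_loop_filter_simple = lf_sse2;
            c->vp8_h_loop_filter_simple = lf_sse2;
        }
    }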
libavcodec/x86/vp8dsp.asm
@@ -146,8 +146,13 @@
 pw_17734: times 4 dw 17734
 
 cextern pw_3
+cextern pb_3
 cextern pw_4
+cextern pb_4
 cextern pw_64
+cextern pb_80
+cextern pb_F8
+cextern pb_FE
 
 SECTION .text
 
@@ -1063,3 +1068,304 @@
     add           r0, 2*16*4
     SCATTER_WHT   3
     RET
+
+;-----------------------------------------------------------------------------
+; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
+;-----------------------------------------------------------------------------
+
+; macro called with 7 mm register indexes as arguments, and 4 regular registers
+;
+; first 4 mm registers will carry the transposed pixel data
+; the other three are scratch space (one would be sufficient, but this allows
+; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
+;
+; first two regular registers are buf+4*stride and buf+5*stride
+; third is -stride, fourth is +stride
+%macro READ_8x4_INTERLEAVED 11
+    ; interleave 8 (A-H) rows of 4 pixels each
+    movd          m%1, [%8+%10*4]   ; A0-3
+    movd          m%5, [%9+%10*4]   ; B0-3
+    movd          m%2, [%8+%10*2]   ; C0-3
+    movd          m%6, [%8+%10]     ; D0-3
+    movd          m%3, [%8]         ; E0-3
+    movd          m%7, [%9]         ; F0-3
+    movd          m%4, [%9+%11]     ; G0-3
+    punpcklbw     m%1, m%5          ; A/B interleaved
+    movd          m%5, [%9+%11*2]   ; H0-3
+    punpcklbw     m%2, m%6          ; C/D interleaved
+    punpcklbw     m%3, m%7          ; E/F interleaved
+    punpcklbw     m%4, m%5          ; G/H interleaved
+%endmacro
+
+; macro called with 7 mm register indexes as arguments, and 5 regular registers
+; first 11 mean the same as in READ_8x4_INTERLEAVED above
+; fifth regular register is scratch space to reach the bottom 8 rows, it
+; will be set to second regular register + 8*stride at the end
+%macro READ_16x4_INTERLEAVED 12
+    ; transpose 16 (A-P) rows of 4 pixels each
+    lea           %12, [r0+8*r2]
+
+    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
+    movd          m%1, [%8+%10*4]   ; A0-3
+    movd          m%3, [%12+%10*4]  ; I0-3
+    movd          m%2, [%8+%10*2]   ; C0-3
+    movd          m%4, [%12+%10*2]  ; K0-3
+    movd          m%6, [%8+%10]     ; D0-3
+    movd          m%5, [%12+%10]    ; L0-3
+    movd          m%7, [%12]        ; M0-3
+    add           %12, %11
+    punpcklbw     m%1, m%3          ; A/I
+    movd          m%3, [%8]         ; E0-3
+    punpcklbw     m%2, m%4          ; C/K
+    punpcklbw     m%6, m%5          ; D/L
+    punpcklbw     m%3, m%7          ; E/M
+    punpcklbw     m%2, m%6          ; C/D/K/L interleaved
+
+    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
+    movd         m%5, [%9+%10*4]   ; B0-3
+    movd         m%4, [%12+%10*4]  ; J0-3
+    movd         m%7, [%9]         ; F0-3
+    movd         m%6, [%12]        ; N0-3
+    punpcklbw    m%5, m%4          ; B/J
+    punpcklbw    m%7, m%6          ; F/N
+    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
+    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
+    movd         m%4, [%9+%11]     ; G0-3
+    movd         m%6, [%12+%11]    ; O0-3
+    movd         m%5, [%9+%11*2]   ; H0-3
+    movd         m%7, [%12+%11*2]  ; P0-3
+    punpcklbw    m%4, m%6          ; G/O
+    punpcklbw    m%5, m%7          ; H/P
+    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
+%endmacro
+
+; write 4 mm registers of 2 dwords each
+; first four arguments are mm register indexes containing source data
+; last four are registers containing buf+4*stride, buf+5*stride,
+; -stride and +stride
+%macro WRITE_4x2D 8
+    ; write out (2 dwords per register)
+    movd    [%5+%7*4], m%1
+    movd    [%5+%7*2], m%2
+    movd         [%5], m%3
+    movd      [%6+%8], m%4
+    punpckhdq     m%1, m%1
+    punpckhdq     m%2, m%2
+    punpckhdq     m%3, m%3
+    punpckhdq     m%4, m%4
+    movd    [%6+%7*4], m%1
+    movd      [%5+%7], m%2
+    movd         [%6], m%3
+    movd    [%6+%8*2], m%4
+%endmacro
+
+; write 4 xmm registers of 4 dwords each
+; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
+; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
+; we add 1*stride to the third regular register in the process
+%macro WRITE_4x4D 9
+    ; write out (4 dwords per register), start with dwords zero
+    movd    [%5+%8*4], m%1
+    movd         [%5], m%2
+    movd    [%5+%9*4], m%3
+    movd    [%5+%9*8], m%4
+
+    ; store dwords 1
+    psrldq        m%1, 4
+    psrldq        m%2, 4
+    psrldq        m%3, 4
+    psrldq        m%4, 4
+    movd    [%6+%8*4], m%1
+    movd         [%6], m%2
+    movd    [%6+%9*4], m%3
+    movd    [%6+%9*8], m%4
+
+    ; store dwords 2
+    psrldq        m%1, 4
+    psrldq        m%2, 4
+    psrldq        m%3, 4
+    psrldq        m%4, 4
+    movd    [%5+%8*2], m%1
+    movd      [%6+%9], m%2
+    movd    [%7+%8*2], m%3
+    movd    [%7+%9*2], m%4
+    add            %7, %9
+
+    ; store dwords 3
+    psrldq        m%1, 4
+    psrldq        m%2, 4
+    psrldq        m%3, 4
+    psrldq        m%4, 4
+    movd      [%5+%8], m%1
+    movd    [%6+%9*2], m%2
+    movd    [%7+%8*2], m%3
+    movd    [%7+%9*2], m%4
+%endmacro
+
+%macro SIMPLE_LOOPFILTER 3
+cglobal vp8_%2_loop_filter_simple_%1, 3, %3
+%ifidn %2, h
+    mov            r5, rsp          ; backup stack pointer
+    and           rsp, ~(mmsize-1)  ; align stack
+%endif
+%if mmsize == 8 ; mmx/mmxext
+    mov            r3, 2
+%endif
+
+    ; splat register with "flim"
+    movd           m7, r2
+    punpcklbw      m7, m7
+%if mmsize == 16 ; sse2
+    punpcklwd      m7, m7
+    pshufd         m7, m7, 0x0
+%elifidn %1, mmx
+    punpcklwd      m7, m7
+    punpckldq      m7, m7
+%else ; mmxext
+    pshufw         m7, m7, 0x0
+%endif
+
+    ; set up indexes to address 4 rows
+    mov            r2, r1
+    neg            r1
+%ifidn %2, h
+    lea            r0, [r0+4*r2-2]
+    sub           rsp, mmsize*2     ; (aligned) storage space for saving p1/q1
+%endif
+
+%if mmsize == 8 ; mmx/mmxext
+.next8px
+%endif
+%ifidn %2, v
+    ; read 4 half/full rows of pixels
+    mova           m0, [r0+r1*2]    ; p1
+    mova           m1, [r0+r1]      ; p0
+    mova           m2, [r0]         ; q0
+    mova           m3, [r0+r2]      ; q1
+%else ; h
+    lea            r4, [r0+r2]
+
+%if mmsize == 8 ; mmx/mmxext
+    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
+%else ; sse2
+    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
+%endif
+    TRANSPOSE4x4W         0, 1, 2, 3, 4
+
+    mova        [rsp], m0           ; store p1
+    mova [rsp+mmsize], m3           ; store q1
+%endif
+
+    ; simple_limit
+    mova           m5, m2           ; m5=backup of q0
+    mova           m6, m1           ; m6=backup of p0
+    psubusb        m1, m2           ; p0-q0
+    psubusb        m2, m6           ; q0-p0
+    por            m1, m2           ; FFABS(p0-q0)
+    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2
+
+    mova           m4, m3
+    mova           m2, m0
+    psubusb        m3, m0           ; q1-p1
+    psubusb        m0, m4           ; p1-q1
+    por            m3, m0           ; FFABS(p1-q1)
+    mova           m0, [pb_80]
+    pxor           m2, m0
+    pxor           m4, m0
+    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
+    pand           m3, [pb_FE]
+    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
+    paddusb        m3, m1
+    psubusb        m3, m7
+    pxor           m1, m1
+    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
+
+    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
+    mova           m4, m5
+    pxor           m5, m0
+    pxor           m0, m6
+    psubsb         m5, m0           ; q0-p0 (signed)
+    paddsb         m2, m5
+    paddsb         m2, m5
+    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
+    pand           m2, m3           ; apply filter mask (m3)
+
+    mova           m3, [pb_F8]
+    mova           m1, m2
+    paddsb         m2, [pb_4]       ; f1<<3=a+4
+    paddsb         m1, [pb_3]       ; f2<<3=a+3
+    pand           m2, m3
+    pand           m1, m3           ; cache f2<<3
+
+    pxor           m0, m0
+    pxor           m3, m3
+    pcmpgtb        m0, m2           ; which values are <0?
+    psubb          m3, m2           ; -f1<<3
+    psrlq          m2, 3            ; +f1
+    psrlq          m3, 3            ; -f1
+    pand           m3, m0
+    pandn          m0, m2
+    psubusb        m4, m0
+    paddusb        m4, m3           ; q0-f1
+
+    pxor           m0, m0
+    pxor           m3, m3
+    pcmpgtb        m0, m1           ; which values are <0?
+    psubb          m3, m1           ; -f2<<3
+    psrlq          m1, 3            ; +f2
+    psrlq          m3, 3            ; -f2
+    pand           m3, m0
+    pandn          m0, m1
+    paddusb        m6, m0
+    psubusb        m6, m3           ; p0+f2
+
+    ; store
+%ifidn %2, v
+    mova         [r0], m4
+    mova      [r0+r1], m6
+%else ; h
+    mova           m0, [rsp]        ; p1
+    SWAP            2, 4            ; p0
+    SWAP            1, 6            ; q0
+    mova           m3, [rsp+mmsize] ; q1
+
+    TRANSPOSE4x4B  0, 1, 2, 3, 4
+%if mmsize == 16 ; sse2
+    add            r3, r1           ; change from r4+8*stride to r0+8*stride
+    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2
+%else ; mmx/mmxext
+    WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
+%endif
+%endif
+
+%if mmsize == 8 ; mmx/mmxext
+    ; next 8 pixels
+%ifidn %2, v
+    add            r0, 8            ; advance 8 cols = pixels
+%else ; h
+    lea            r0, [r0+r2*8]    ; advance 8 rows = lines
+%endif
+    dec            r3
+    jg .next8px
+%ifidn %2, v
+    REP_RET
+%else ; h
+    mov           rsp, r5           ; restore stack pointer
+    RET
+%endif
+%else ; sse2
+%ifidn %2, h
+    mov           rsp, r5           ; restore stack pointer
+%endif
+    RET
+%endif
+%endmacro
+
+INIT_MMX
+SIMPLE_LOOPFILTER mmx,    v, 4
+SIMPLE_LOOPFILTER mmx,    h, 6
+SIMPLE_LOOPFILTER mmxext, v, 4
+SIMPLE_LOOPFILTER mmxext, h, 6
+INIT_XMM
+SIMPLE_LOOPFILTER sse2,   v, 3
+SIMPLE_LOOPFILTER sse2,   h, 6
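
Note: read per pixel, SIMPLE_LOOPFILTER implements VP8's "simple" edge filter, a threshold test (simple_limit) followed by filter_common. A scalar C reconstruction assembled from the comments above (a sketch assuming the usual VP8 semantics for the saturating steps and arithmetic right shifts; the SIMD code reaches the same values with byte-wise saturating arithmetic and the pb_* masks):

    #include <stdint.h>

    static int clip_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

    /* Filter the two pixels around one edge position; p1,p0 sit before the
     * edge and q0,q1 after it (rows for the v variant, columns for h). */
    static void vp8_simple_filter_scalar(uint8_t *P1, uint8_t *P0,
                                         uint8_t *Q0, uint8_t *Q1, int flim)
    {
        int dp = *P0 - *Q0, dq = *P1 - *Q1;

        /* simple_limit: FFABS(p0-q0)*2 + FFABS(p1-q1)/2 <= flim
         * (the SIMD version derives the same mask via saturating byte math) */
        if (2 * (dp < 0 ? -dp : dp) + (dq < 0 ? -dq : dq) / 2 > flim)
            return;

        /* re-bias to signed, as the pxor with [pb_80] does */
        int p1 = *P1 - 128, p0 = *P0 - 128;
        int q0 = *Q0 - 128, q1 = *Q1 - 128;

        /* filter_common: a = (p1-q1) + 3*(q0-p0), saturating at every step
         * exactly like the psubsb/paddsb sequence */
        int d = clip_s8(q0 - p0);
        int a = clip_s8(p1 - q1);
        a = clip_s8(a + d);
        a = clip_s8(a + d);
        a = clip_s8(a + d);

        int f1 = clip_s8(a + 4) >> 3;   /* paddsb [pb_4], pand [pb_F8], >>3 */
        int f2 = clip_s8(a + 3) >> 3;   /* paddsb [pb_3], same masking      */

        *Q0 = (uint8_t)(clip_s8(q0 - f1) + 128);  /* q0 -= f1 */
        *P0 = (uint8_t)(clip_s8(p0 + f2) + 128);  /* p0 += f2 */
    }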
libavcodec/x86/x86util.asm
@@ -37,6 +37,14 @@
     SWAP %2, %4, %3
 %endmacro
 
+%macro TRANSPOSE4x4B 5
+    SBUTTERFLY bw, %1, %2, %5
+    SBUTTERFLY bw, %3, %4, %5
+    SBUTTERFLY wd, %1, %3, %5
+    SBUTTERFLY wd, %2, %4, %5
+    SWAP %2, %3
+%endmacro
+
 %macro TRANSPOSE4x4W 5
     SBUTTERFLY wd, %1, %2, %5
     SBUTTERFLY wd, %3, %4, %5
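
Note: TRANSPOSE4x4B is the standard interleave transpose: one SBUTTERFLY round at byte granularity, one at word granularity, then a register SWAP to fix the output order. A simplified scalar model (illustrative; it widens the buffers so the low-half interleaves suffice, whereas the macro pairs the punpckl*/punpckh* halves):

    #include <stdint.h>
    #include <stdio.h>

    /* Model of punpckl*: interleave n elements of elem bytes from a and b. */
    static void zip(const uint8_t *a, const uint8_t *b, int elem, int n,
                    uint8_t *out)
    {
        for (int i = 0; i < n; i++)
            for (int k = 0; k < elem; k++) {
                out[(2 * i)     * elem + k] = a[i * elem + k];
                out[(2 * i + 1) * elem + k] = b[i * elem + k];
            }
    }

    int main(void)
    {
        uint8_t m[4][4], t01[8], t23[8], out[16];
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 4; c++)
                m[r][c] = (uint8_t)(r * 4 + c);  /* value encodes (row, col) */

        zip(m[0], m[1], 1, 4, t01);  /* SBUTTERFLY bw on rows 0/1 */
        zip(m[2], m[3], 1, 4, t23);  /* SBUTTERFLY bw on rows 2/3 */
        zip(t01, t23, 2, 4, out);    /* SBUTTERFLY wd across the pairs */

        /* out[4*c + r] == m[r][c]: the tile is now transposed */
        for (int c = 0; c < 4; c++) {
            for (int r = 0; r < 4; r++)
                printf("%2d ", out[4 * c + r]);
            printf("\n");
        }
        return 0;
    }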
