Revision 36e8de07

View differences:

libswscale/swscale_template.c
385 385
    "packuswb        %%mm6, %%mm5       \n\t"\
386 386
    "packuswb        %%mm3, %%mm4       \n\t"\
387 387
    "pxor            %%mm7, %%mm7       \n\t"
388
#if 0
389
/*
 * FULL_YSCALEYUV2RGB: inline-asm fragment (MMX) that opens a loop at label
 * "1:", blends four 16-bit samples from two luma rows and two chroma rows
 * with the yalpha1 / uvalpha1 weights, and converts the result to RGB.
 *
 * Asm operands as used by the code that expands this macro:
 *   %0 = buf0, %1 = buf1, %2 = uvbuf0, %3 = uvbuf1,
 *   %6 = yalpha1, %7 = uvalpha1; REG_a is the sample index.
 *
 * On exit from the fragment: mm3 = B, mm1 = G, mm0 = R (each packed to
 * unsigned bytes with saturation), mm7 = 0.  The macro does NOT close the
 * loop: the expanding code stores the pixels, advances REG_a and branches
 * back to "1:".
 */
#define FULL_YSCALEYUV2RGB \
390
    "pxor                 %%mm7, %%mm7  \n\t" /* mm7 = 0 */\
391
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
392
    "punpcklwd            %%mm6, %%mm6  \n\t"\
393
    "punpcklwd            %%mm6, %%mm6  \n\t" /* low word of yalpha1 replicated into all 4 words of mm6 */\
394
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
395
    "punpcklwd            %%mm5, %%mm5  \n\t"\
396
    "punpcklwd            %%mm5, %%mm5  \n\t" /* low word of uvalpha1 replicated into all 4 words of mm5 */\
397
    "xor              %%"REG_a", %%"REG_a"  \n\t" /* sample index = 0 */\
398
    ASMALIGN(4)\
399
    "1:                                 \n\t" /* loop head; expanding code branches back here (jb 1b) */\
400
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
401
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
402
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
403
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
404
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
405
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408
    "psraw                   $4, %%mm1  \n\t" /* buf1[eax] >>4 (mm1 still holds buf1, not the difference)*/\
409
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
410
    "psraw                   $4, %%mm3  \n\t" /* uvbuf1[eax] >>4 (mm3 still holds uvbuf1, not the difference)*/\
411
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
413
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
414
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
416
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
417
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t" /* scaled luma term, added to B, R and G below */\
418
\
419
\
420
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
422
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t" /* U contribution to B */\
423
    "psraw                   $4, %%mm0  \n\t" /* uvbuf1[eax+2048] >>4 (mm0 still holds uvbuf1, not the difference)*/\
424
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t" /* U contribution to G */\
425
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
426
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
427
\
428
\
429
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
430
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t" /* V contribution to R */\
431
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t" /* V contribution to G */\
432
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
433
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
434
    "packuswb             %%mm3, %%mm3  \n\t" /* B saturated to unsigned bytes */\
435
\
436
    "packuswb             %%mm0, %%mm0  \n\t" /* R saturated to unsigned bytes */\
437
    "paddw                %%mm4, %%mm2  \n\t" /* U and V contributions to G summed */\
438
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
439
\
440
    "packuswb             %%mm1, %%mm1  \n\t"
441
#endif
442 388

  
443 389
#define REAL_YSCALEYUV2PACKED(index, c) \
444 390
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
......
1213 1159
    int uvalpha1=4095-uvalpha;
1214 1160
    int i;
1215 1161

  
1216
#if 0 //isn't used
1217
    if (flags&SWS_FULL_CHR_H_INT)
1218
    {
1219
        switch(dstFormat)
1220
        {
1221
#ifdef HAVE_MMX
1222
        case PIX_FMT_RGB32:
1223
            __asm__ volatile(
1224

  
1225

  
1226
FULL_YSCALEYUV2RGB
1227
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1228
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1229

  
1230
            "movq      %%mm3, %%mm1    \n\t"
1231
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1232
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1233

  
1234
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1235
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1236

  
1237
            "add $4, %%"REG_a"  \n\t"
1238
            "cmp %5, %%"REG_a"  \n\t"
1239
            " jb 1b             \n\t"
1240

  
1241
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242
            "m" (yalpha1), "m" (uvalpha1)
1243
            : "%"REG_a
1244
            );
1245
            break;
1246
        case PIX_FMT_BGR24:
1247
            __asm__ volatile(
1248

  
1249
FULL_YSCALEYUV2RGB
1250

  
1251
                                              // lsb ... msb
1252
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1253
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1254

  
1255
            "movq      %%mm3, %%mm1     \n\t"
1256
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1257
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1258

  
1259
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1260
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1261
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1262
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1263
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1264
            "movq      %%mm1, %%mm2     \n\t"
1265
            "psllq       $48, %%mm1     \n\t" // 000000BG
1266
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1267

  
1268
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1269
            "psrld       $16, %%mm2     \n\t" // R000R000
1270
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1271
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1272

  
1273
            "mov          %4, %%"REG_b" \n\t"
1274
            "add   %%"REG_a", %%"REG_b" \n\t"
1275

  
1276
#ifdef HAVE_MMX2
1277
            //FIXME Alignment
1278
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1279
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1280
#else
1281
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1282
            "psrlq  $32, %%mm3                          \n\t"
1283
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1284
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1285
#endif
1286
            "add     $4, %%"REG_a"                      \n\t"
1287
            "cmp     %5, %%"REG_a"                      \n\t"
1288
            " jb     1b                                 \n\t"
1289

  
1290
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291
            "m" (yalpha1), "m" (uvalpha1)
1292
            : "%"REG_a, "%"REG_b
1293
            );
1294
            break;
1295
        case PIX_FMT_BGR555:
1296
            __asm__ volatile(
1297

  
1298
FULL_YSCALEYUV2RGB
1299
#ifdef DITHER1XBPP
1300
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1301
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1302
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1303
#endif
1304
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1305
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1306
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1307

  
1308
            "psrlw                   $3, %%mm3  \n\t"
1309
            "psllw                   $2, %%mm1  \n\t"
1310
            "psllw                   $7, %%mm0  \n\t"
1311
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1312
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1313

  
1314
            "por                  %%mm3, %%mm1  \n\t"
1315
            "por                  %%mm1, %%mm0  \n\t"
1316

  
1317
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1318

  
1319
            "add $4, %%"REG_a"  \n\t"
1320
            "cmp %5, %%"REG_a"  \n\t"
1321
            " jb 1b             \n\t"
1322

  
1323
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324
            "m" (yalpha1), "m" (uvalpha1)
1325
            : "%"REG_a
1326
            );
1327
            break;
1328
        case PIX_FMT_BGR565:
1329
            __asm__ volatile(
1330

  
1331
FULL_YSCALEYUV2RGB
1332
#ifdef DITHER1XBPP
1333
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1334
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1335
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1336
#endif
1337
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1338
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1339
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1340

  
1341
            "psrlw                   $3, %%mm3  \n\t"
1342
            "psllw                   $3, %%mm1  \n\t"
1343
            "psllw                   $8, %%mm0  \n\t"
1344
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1345
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1346

  
1347
            "por                  %%mm3, %%mm1  \n\t"
1348
            "por                  %%mm1, %%mm0  \n\t"
1349

  
1350
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1351

  
1352
            "add $4, %%"REG_a"  \n\t"
1353
            "cmp %5, %%"REG_a"  \n\t"
1354
            " jb 1b             \n\t"
1355

  
1356
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357
            "m" (yalpha1), "m" (uvalpha1)
1358
            : "%"REG_a
1359
            );
1360
            break;
1361
#endif /* HAVE_MMX */
1362
        case PIX_FMT_BGR32:
1363
#ifndef HAVE_MMX
1364
        case PIX_FMT_RGB32:
1365
#endif
1366
            if (dstFormat==PIX_FMT_RGB32)
1367
            {
1368
                int i;
1369
#ifdef WORDS_BIGENDIAN
1370
                dest++;
1371
#endif
1372
                for (i=0;i<dstW;i++){
1373
                    // vertical linear interpolation && yuv2rgb in a single step:
1374
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1376
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1377
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1380
                    dest+= 4;
1381
                }
1382
            }
1383
            else if (dstFormat==PIX_FMT_BGR24)
1384
            {
1385
                int i;
1386
                for (i=0;i<dstW;i++){
1387
                    // vertical linear interpolation && yuv2rgb in a single step:
1388
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1390
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1391
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1394
                    dest+= 3;
1395
                }
1396
            }
1397
            else if (dstFormat==PIX_FMT_BGR565)
1398
            {
1399
                int i;
1400
                for (i=0;i<dstW;i++){
1401
                    // vertical linear interpolation && yuv2rgb in a single step:
1402
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1404
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1405

  
1406
                    ((uint16_t*)dest)[i] =
1407
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1410
                }
1411
            }
1412
            else if (dstFormat==PIX_FMT_BGR555)
1413
            {
1414
                int i;
1415
                for (i=0;i<dstW;i++){
1416
                    // vertical linear interpolation && yuv2rgb in a single step:
1417
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1419
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1420

  
1421
                    ((uint16_t*)dest)[i] =
1422
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1425
                }
1426
            }
1427
        }//FULL_UV_IPOL
1428
    else
1429
    {
1430
#endif // if 0
1431 1162
#ifdef HAVE_MMX
1432 1163
    if(!(c->flags & SWS_BITEXACT)){
1433 1164
        switch(c->dstFormat)

Also available in: Unified diff