Revision 6216fc70

This revision removes the COMPILE_TEMPLATE_MMX toggle from libswscale's x86-specific rgb2rgb code: in this directory the template is only ever built as an MMX (or better) variant, so the "#if COMPILE_TEMPLATE_MMX" guards were always true, and the plain-C, big-endian, and Alpha fallback branches in rgb2rgb_template.c had become dead code.

View differences:
libswscale/x86/rgb2rgb.c
 
 //Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
 
-#define COMPILE_TEMPLATE_MMX 0
 #define COMPILE_TEMPLATE_MMX2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define COMPILE_TEMPLATE_SSE2 0
 
 //MMX versions
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX
-#define COMPILE_TEMPLATE_MMX 1
 #define RENAME(a) a ## _MMX
 #include "rgb2rgb_template.c"
 
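For context, rgb2rgb.c builds the template once per instruction set: it sets the COMPILE_TEMPLATE_* flags, redefines the RENAME() suffix macro, and re-includes rgb2rgb_template.c. A minimal sketch of that multiple-inclusion pattern, with the COMPILE_TEMPLATE_MMX toggle already gone as in this revision (the _MMX2 pass shown here stands in for the later passes in the real file; this is an illustration of the pattern, not the exact file contents):

    /* sketch of the per-ISA template instantiation in rgb2rgb.c */
    #define COMPILE_TEMPLATE_MMX2      0
    #define COMPILE_TEMPLATE_AMD3DNOW  0
    #define COMPILE_TEMPLATE_SSE2      0

    /* MMX baseline: every function in the template gets an _MMX suffix */
    #undef  RENAME
    #define RENAME(a) a ## _MMX
    #include "rgb2rgb_template.c"

    /* MMX2 pass: the same template, recompiled with wider fast paths */
    #undef  RENAME
    #undef  COMPILE_TEMPLATE_MMX2
    #define COMPILE_TEMPLATE_MMX2 1
    #define RENAME(a) a ## _MMX2
    #include "rgb2rgb_template.c"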
libswscale/x86/rgb2rgb_template.c
     uint8_t *dest = dst;
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     mm_end = end - 23;
     __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
-#if HAVE_BIGENDIAN
-        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
-        *dest++ = 255;
-        *dest++ = s[2];
-        *dest++ = s[1];
-        *dest++ = s[0];
-        s+=3;
-#else
         *dest++ = *s++;
         *dest++ = *s++;
         *dest++ = *s++;
         *dest++ = 255;
-#endif
     }
 }
 
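Stripped to its surviving scalar tail, rgb24to32 just copies each three-byte pixel and appends an opaque alpha byte. A self-contained sketch of that loop, assuming the little-endian layout the kept branch targets (function name hypothetical):

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical standalone version of the scalar tail kept above:
     * copy the three colour bytes unchanged, then append alpha = 255. */
    static void rgb24to32_scalar(const uint8_t *src, uint8_t *dst, size_t src_size)
    {
        const uint8_t *s   = src;
        const uint8_t *end = src + src_size;
        while (s < end) {
            *dst++ = *s++;  /* first colour byte  */
            *dst++ = *s++;  /* second colour byte */
            *dst++ = *s++;  /* third colour byte  */
            *dst++ = 255;   /* alpha: fully opaque */
        }
    }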
......
     uint8_t *dest = dst;
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     mm_end = end - 31;
     while (s < mm_end) {
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
-#if HAVE_BIGENDIAN
-        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
-        s++;
-        dest[2] = *s++;
-        dest[1] = *s++;
-        dest[0] = *s++;
-        dest += 3;
-#else
         *dest++ = *s++;
         *dest++ = *s++;
         *dest++ = *s++;
         s++;
-#endif
     }
 }
 
......
     register const uint8_t *end;
     const uint8_t *mm_end;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*s));
     __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
     mm_end = end - 15;
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     mm_end = end - 3;
     while (s < mm_end) {
         register unsigned x= *((const uint32_t *)s);
......
     register const uint8_t *end;
     const uint8_t *mm_end;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*s));
     __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
     __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     mm_end = end - 3;
     while (s < mm_end) {
         register uint32_t x= *((const uint32_t*)s);
......
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     mm_end = end - 15;
 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
     __asm__ volatile(
......
 #endif
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register int rgb = *(const uint32_t*)s; s += 4;
         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
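The scalar tail above packs one 32-bit pixel (loaded little-endian) into a 5-6-5 word: the first source byte keeps its top 5 bits, the second its top 6, the third its top 5. Since the three masked fields never overlap, + and | are interchangeable here. A hedged worked example:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical one-pixel version of the scalar tail above: pack the
     * three low bytes of a 32-bit pixel into a 5-6-5 word. */
    static uint16_t pack565(uint32_t p)
    {
        return (uint16_t)(((p & 0x0000FF) >> 3) |  /* byte 0 -> bits  4:0  */
                          ((p & 0x00FC00) >> 5) |  /* byte 1 -> bits 10:5  */
                          ((p & 0xF80000) >> 8));  /* byte 2 -> bits 15:11 */
    }

    int main(void)
    {
        printf("0x%04X\n", pack565(0x00FF0000u)); /* byte 2 at full scale -> 0xF800 */
        return 0;
    }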
......
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
     __asm__ volatile(
         "movq          %0, %%mm7    \n\t"
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register int rgb = *(const uint32_t*)s; s += 4;
         *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
......
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     mm_end = end - 15;
 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
     __asm__ volatile(
......
 #endif
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register int rgb = *(const uint32_t*)s; s += 4;
         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
......
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
     __asm__ volatile(
         "movq          %0, %%mm7    \n\t"
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register int rgb = *(const uint32_t*)s; s += 4;
         *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
......
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
     __asm__ volatile(
         "movq         %0, %%mm7     \n\t"
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         const int b = *s++;
         const int g = *s++;
......
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
     __asm__ volatile(
         "movq         %0, %%mm7     \n\t"
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         const int r = *s++;
         const int g = *s++;
......
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
     __asm__ volatile(
         "movq          %0, %%mm7    \n\t"
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         const int b = *s++;
         const int g = *s++;
......
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
     __asm__ volatile(
         "movq         %0, %%mm7     \n\t"
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         const int r = *s++;
         const int g = *s++;
......
 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint16_t *mm_end;
-#endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t*)src;
     end = s + src_size/2;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     mm_end = end - 7;
     while (s < mm_end) {
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
......
 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint16_t *mm_end;
-#endif
     uint8_t *d = (uint8_t *)dst;
     const uint16_t *s = (const uint16_t *)src;
     end = s + src_size/2;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     mm_end = end - 7;
     while (s < mm_end) {
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
......
 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint16_t *mm_end;
-#endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t *)src;
     end = s + src_size/2;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
     __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-#if HAVE_BIGENDIAN
-        *d++ = 255;
-        *d++ = (bgr&0x7C00)>>7;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x1F)<<3;
-#else
         *d++ = (bgr&0x1F)<<3;
         *d++ = (bgr&0x3E0)>>2;
         *d++ = (bgr&0x7C00)>>7;
         *d++ = 255;
-#endif
     }
 }
 
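In the scalar tail above each 5-bit field is widened to 8 bits by shifting it to the top of its byte; the low three bits stay zero rather than being replicated, so full-scale 0x7FFF expands to 0xF8,0xF8,0xF8 rather than 0xFF,0xFF,0xFF. A hedged one-pixel sketch (rgb16to32 below differs only in the green mask and shifts):

    #include <stdint.h>

    /* Hypothetical one-pixel version of the rgb15to32 scalar tail:
     * expand the three 5-bit fields to four 8-bit components. */
    static void expand555(uint16_t bgr, uint8_t out[4])
    {
        out[0] = (uint8_t)((bgr & 0x001F) << 3);  /* low field:  bits  4:0  */
        out[1] = (uint8_t)((bgr & 0x03E0) >> 2);  /* mid field:  bits  9:5  */
        out[2] = (uint8_t)((bgr & 0x7C00) >> 7);  /* high field: bits 14:10 */
        out[3] = 255;                             /* opaque alpha */
    }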
 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#if COMPILE_TEMPLATE_MMX
     const uint16_t *mm_end;
-#endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t*)src;
     end = s + src_size/2;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
     __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
......
     }
     __asm__ volatile(SFENCE:::"memory");
     __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-#if HAVE_BIGENDIAN
-        *d++ = 255;
-        *d++ = (bgr&0xF800)>>8;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0x1F)<<3;
-#else
         *d++ = (bgr&0x1F)<<3;
         *d++ = (bgr&0x7E0)>>3;
         *d++ = (bgr&0xF800)>>8;
         *d++ = 255;
-#endif
     }
 }
 
......
     x86_reg idx = 15 - src_size;
     const uint8_t *s = src-idx;
     uint8_t *d = dst-idx;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(
         "test          %0, %0           \n\t"
         "jns           2f               \n\t"
......
         : "+&r"(idx)
         : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
         : "memory");
-#endif
     for (; idx<15; idx+=4) {
         register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
         v &= 0xff00ff;
......
 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     unsigned i;
-#if COMPILE_TEMPLATE_MMX
     x86_reg mmx_size= 23 - src_size;
     __asm__ volatile (
         "test             %%"REG_a", %%"REG_a"          \n\t"
......
     src_size= 23-mmx_size;
     src-= src_size;
     dst-= src_size;
-#endif
     for (i=0; i<src_size; i+=3) {
         register uint8_t x;
         x          = src[i + 2];
......
     long y;
     const x86_reg chromWidth= width>>1;
     for (y=0; y<height; y++) {
-#if COMPILE_TEMPLATE_MMX
         //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
         __asm__ volatile(
             "xor                 %%"REG_a", %%"REG_a"   \n\t"
......
             ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
             : "%"REG_a
         );
-#else
-
-#if ARCH_ALPHA && HAVE_MVI
-#define pl2yuy2(n)                  \
-    y1 = yc[n];                     \
-    y2 = yc2[n];                    \
-    u = uc[n];                      \
-    v = vc[n];                      \
-    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
-    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
-    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
-    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
-    yuv1 = (u << 8) + (v << 24);                \
-    yuv2 = yuv1 + y2;               \
-    yuv1 += y1;                     \
-    qdst[n]  = yuv1;                \
-    qdst2[n] = yuv2;
-
-        int i;
-        uint64_t *qdst = (uint64_t *) dst;
-        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
-        const uint32_t *yc = (uint32_t *) ysrc;
-        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
-        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
-        for (i = 0; i < chromWidth; i += 8) {
-            uint64_t y1, y2, yuv1, yuv2;
-            uint64_t u, v;
-            /* Prefetch */
-            __asm__("ldq $31,64(%0)" :: "r"(yc));
-            __asm__("ldq $31,64(%0)" :: "r"(yc2));
-            __asm__("ldq $31,64(%0)" :: "r"(uc));
-            __asm__("ldq $31,64(%0)" :: "r"(vc));
-
-            pl2yuy2(0);
-            pl2yuy2(1);
-            pl2yuy2(2);
-            pl2yuy2(3);
-
-            yc    += 4;
-            yc2   += 4;
-            uc    += 4;
-            vc    += 4;
-            qdst  += 4;
-            qdst2 += 4;
-        }
-        y++;
-        ysrc += lumStride;
-        dst += dstStride;
-
-#elif HAVE_FAST_64BIT
-        int i;
-        uint64_t *ldst = (uint64_t *) dst;
-        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
-        for (i = 0; i < chromWidth; i += 2) {
-            uint64_t k, l;
-            k = yc[0] + (uc[0] << 8) +
-                (yc[1] << 16) + (vc[0] << 24);
-            l = yc[2] + (uc[1] << 8) +
-                (yc[3] << 16) + (vc[1] << 24);
-            *ldst++ = k + (l << 32);
-            yc += 4;
-            uc += 2;
-            vc += 2;
-        }
-
-#else
-        int i, *idst = (int32_t *) dst;
-        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
-        for (i = 0; i < chromWidth; i++) {
-#if HAVE_BIGENDIAN
-            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
-                (yc[1] << 8) + (vc[0] << 0);
-#else
-            *idst++ = yc[0] + (uc[0] << 8) +
-                (yc[1] << 16) + (vc[0] << 24);
-#endif
-            yc += 2;
-            uc++;
-            vc++;
-        }
-#endif
-#endif
         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
             usrc += chromStride;
             vsrc += chromStride;
......
         ysrc += lumStride;
         dst  += dstStride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(EMMS"       \n\t"
             SFENCE"     \n\t"
             :::"memory");
-#endif
 }
 
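All of the removed fallbacks produce the same byte stream: the little-endian store yc[0] + (uc[0] << 8) + (yc[1] << 16) + (vc[0] << 24) writes the bytes Y,U,Y,V in memory order. A byte-wise sketch of one output line, free of endian-dependent integer stores (function name hypothetical):

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical byte-wise YV12 -> YUY2 interleave of one line of
     * width w (w even): two luma samples share one U and one V. */
    static void yuy2_line(const uint8_t *y, const uint8_t *u,
                          const uint8_t *v, uint8_t *dst, size_t w)
    {
        for (size_t i = 0; i < w / 2; i++) {
            dst[4*i + 0] = y[2*i + 0];
            dst[4*i + 1] = u[i];
            dst[4*i + 2] = y[2*i + 1];
            dst[4*i + 3] = v[i];
        }
    }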
 /**
......
     long y;
     const x86_reg chromWidth= width>>1;
     for (y=0; y<height; y++) {
-#if COMPILE_TEMPLATE_MMX
         //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
         __asm__ volatile(
             "xor                %%"REG_a", %%"REG_a"    \n\t"
......
             ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
             : "%"REG_a
         );
-#else
-//FIXME adapt the Alpha ASM code from yv12->yuy2
-
-#if HAVE_FAST_64BIT
-        int i;
-        uint64_t *ldst = (uint64_t *) dst;
-        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
-        for (i = 0; i < chromWidth; i += 2) {
-            uint64_t k, l;
-            k = uc[0] + (yc[0] << 8) +
-                (vc[0] << 16) + (yc[1] << 24);
-            l = uc[1] + (yc[2] << 8) +
-                (vc[1] << 16) + (yc[3] << 24);
-            *ldst++ = k + (l << 32);
-            yc += 4;
-            uc += 2;
-            vc += 2;
-        }
-
-#else
-        int i, *idst = (int32_t *) dst;
-        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
-        for (i = 0; i < chromWidth; i++) {
-#if HAVE_BIGENDIAN
-            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
-                (vc[0] << 8) + (yc[1] << 0);
-#else
-            *idst++ = uc[0] + (yc[0] << 8) +
-               (vc[0] << 16) + (yc[1] << 24);
-#endif
-            yc += 2;
-            uc++;
-            vc++;
-        }
-#endif
-#endif
         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
             usrc += chromStride;
             vsrc += chromStride;
......
         ysrc += lumStride;
         dst += dstStride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(EMMS"       \n\t"
             SFENCE"     \n\t"
             :::"memory");
-#endif
 }
 
 /**
......
     long y;
     const x86_reg chromWidth= width>>1;
     for (y=0; y<height; y+=2) {
-#if COMPILE_TEMPLATE_MMX
         __asm__ volatile(
             "xor                 %%"REG_a", %%"REG_a"   \n\t"
             "pcmpeqw                 %%mm7, %%mm7       \n\t"
......
             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
             : "memory", "%"REG_a
         );
-#else
-        long i;
-        for (i=0; i<chromWidth; i++) {
-            ydst[2*i+0]     = src[4*i+0];
-            udst[i]     = src[4*i+1];
-            ydst[2*i+1]     = src[4*i+2];
-            vdst[i]     = src[4*i+3];
-        }
-        ydst += lumStride;
-        src  += srcStride;
-
-        for (i=0; i<chromWidth; i++) {
-            ydst[2*i+0]     = src[4*i+0];
-            ydst[2*i+1]     = src[4*i+2];
-        }
-#endif
         udst += chromStride;
         vdst += chromStride;
         ydst += lumStride;
         src  += srcStride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(EMMS"       \n\t"
                      SFENCE"     \n\t"
                      :::"memory");
-#endif
 }
 
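The removed fallback makes the 4:2:0 downsampling visible: the first line of each pair consumes all four bytes of every Y,U,Y,V group, the second keeps only the two luma bytes, so the odd line's chroma is simply dropped. A compact sketch of that two-line step (function name hypothetical):

    #include <stdint.h>

    /* Hypothetical two-line YUY2 -> YV12 step: chroma comes from the
     * even line only; the odd line contributes luma alone. */
    static void yuy2_pair_to_yv12(const uint8_t *even, const uint8_t *odd,
                                  uint8_t *y0, uint8_t *y1,
                                  uint8_t *u, uint8_t *v, long chromWidth)
    {
        for (long i = 0; i < chromWidth; i++) {
            y0[2*i + 0] = even[4*i + 0];
            u[i]        = even[4*i + 1];
            y0[2*i + 1] = even[4*i + 2];
            v[i]        = even[4*i + 3];
            y1[2*i + 0] = odd[4*i + 0];
            y1[2*i + 1] = odd[4*i + 2];
        }
    }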
 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
......
     }
 #endif
 
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(EMMS"       \n\t"
                      SFENCE"     \n\t"
                      :::"memory");
-#endif
 }
 
 /**
......
     long y;
     const x86_reg chromWidth= width>>1;
     for (y=0; y<height; y+=2) {
-#if COMPILE_TEMPLATE_MMX
         __asm__ volatile(
             "xor                 %%"REG_a", %%"REG_a"   \n\t"
             "pcmpeqw             %%mm7, %%mm7   \n\t"
......
             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
             : "memory", "%"REG_a
         );
-#else
-        long i;
-        for (i=0; i<chromWidth; i++) {
-            udst[i]     = src[4*i+0];
-            ydst[2*i+0] = src[4*i+1];
-            vdst[i]     = src[4*i+2];
-            ydst[2*i+1] = src[4*i+3];
-        }
-        ydst += lumStride;
-        src  += srcStride;
-
-        for (i=0; i<chromWidth; i++) {
-            ydst[2*i+0] = src[4*i+1];
-            ydst[2*i+1] = src[4*i+3];
-        }
-#endif
         udst += chromStride;
         vdst += chromStride;
         ydst += lumStride;
         src  += srcStride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(EMMS"       \n\t"
                      SFENCE"     \n\t"
                      :::"memory");
-#endif
 }
 
 /**
......
 {
     long y;
     const x86_reg chromWidth= width>>1;
-#if COMPILE_TEMPLATE_MMX
     for (y=0; y<height-2; y+=2) {
         long i;
         for (i=0; i<2; i++) {
......
     __asm__ volatile(EMMS"       \n\t"
                      SFENCE"     \n\t"
                      :::"memory");
-#else
-    y=0;
-#endif
+
     for (; y<height; y+=2) {
         long i;
         for (i=0; i<chromWidth; i++) {
......
     for (h=0; h < height; h++) {
         long w;
 
-#if COMPILE_TEMPLATE_MMX
 #if COMPILE_TEMPLATE_SSE2
         __asm__(
             "xor              %%"REG_a", %%"REG_a"  \n\t"
......
             dest[2*w+0] = src1[w];
             dest[2*w+1] = src2[w];
         }
-#else
-        for (w=0; w < width; w++) {
-            dest[2*w+0] = src1[w];
-            dest[2*w+1] = src2[w];
-        }
-#endif
         dest += dstStride;
         src1 += src1Stride;
         src2 += src2Stride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(
             EMMS"       \n\t"
             SFENCE"     \n\t"
             ::: "memory"
             );
-#endif
 }
 
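interleaveBytes reduces to the loop kept above: zip one byte from each source plane into the destination, row by row. A hedged standalone sketch plus a tiny caller (names hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical scalar stand-in for RENAME(interleaveBytes). */
    static void interleave_bytes(const uint8_t *src1, const uint8_t *src2,
                                 uint8_t *dest, long width, long height,
                                 long s1Stride, long s2Stride, long dStride)
    {
        for (long h = 0; h < height; h++) {
            for (long w = 0; w < width; w++) {
                dest[2*w + 0] = src1[w];
                dest[2*w + 1] = src2[w];
            }
            dest += dStride;
            src1 += s1Stride;
            src2 += s2Stride;
        }
    }

    int main(void)
    {
        const uint8_t a[4] = {1, 3, 5, 7}, b[4] = {2, 4, 6, 8};
        uint8_t out[8];
        interleave_bytes(a, b, out, 4, 1, 4, 4, 8);
        for (int i = 0; i < 8; i++)
            printf("%d ", out[i]);      /* prints: 1 2 3 4 5 6 7 8 */
        printf("\n");
        return 0;
    }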
 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
......
     x86_reg y;
     long x,w,h;
     w=width/2; h=height/2;
-#if COMPILE_TEMPLATE_MMX
     __asm__ volatile(
         PREFETCH" %0    \n\t"
         PREFETCH" %1    \n\t"
         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
-#endif
     for (y=0;y<h;y++) {
         const uint8_t* s1=src1+srcStride1*(y>>1);
         uint8_t* d=dst1+dstStride1*y;
         x=0;
-#if COMPILE_TEMPLATE_MMX
         for (;x<w-31;x+=32) {
             __asm__ volatile(
                 PREFETCH"   32%1        \n\t"
......
                 :"m"(s1[x])
                 :"memory");
         }
-#endif
         for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
     }
     for (y=0;y<h;y++) {
         const uint8_t* s2=src2+srcStride2*(y>>1);
         uint8_t* d=dst2+dstStride2*y;
         x=0;
-#if COMPILE_TEMPLATE_MMX
         for (;x<w-31;x+=32) {
             __asm__ volatile(
                 PREFETCH"   32%1        \n\t"
......
                 :"m"(s2[x])
                 :"memory");
         }
-#endif
         for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(
             EMMS"       \n\t"
             SFENCE"     \n\t"
             ::: "memory"
         );
-#endif
 }
 
 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
......
         const uint8_t* vp=src3+srcStride3*(y>>2);
         uint8_t* d=dst+dstStride*y;
         x=0;
-#if COMPILE_TEMPLATE_MMX
         for (;x<w-7;x+=8) {
             __asm__ volatile(
                 PREFETCH"   32(%1, %0)          \n\t"
......
                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
                 :"memory");
         }
-#endif
         for (; x<w; x++) {
             const long x2 = x<<2;
             d[8*x+0] = yp[x2];
......
             d[8*x+7] = vp[x];
         }
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(
             EMMS"       \n\t"
             SFENCE"     \n\t"
             ::: "memory"
         );
-#endif
 }
 
 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
......
     src += 2*count;
     count= - count;
 
-#if COMPILE_TEMPLATE_MMX
     if(count <= -16) {
         count += 15;
         __asm__ volatile(
......
         );
         count -= 15;
     }
-#endif
     while(count<0) {
         dst[count]= src[2*count];
         count++;
......
     dst1+=   count;
     src += 4*count;
     count= - count;
-#if COMPILE_TEMPLATE_MMX
     if(count <= -8) {
         count += 7;
         __asm__ volatile(
......
         );
         count -= 7;
     }
-#endif
     while(count<0) {
         dst0[count]= src[4*count+0];
         dst1[count]= src[4*count+2];
......
     dst1+=   count;
     src += 4*count;
     count= - count;
-#if COMPILE_TEMPLATE_MMX
     if(count <= -8) {
         count += 7;
         __asm__ volatile(
......
         );
         count -= 7;
     }
-#endif
     src++;
     while(count<0) {
         dst0[count]= src[4*count+0];
......
         src += srcStride;
         ydst+= lumStride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(
             EMMS"       \n\t"
             SFENCE"     \n\t"
             ::: "memory"
         );
-#endif
 }
 
 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
......
         udst+= chromStride;
         vdst+= chromStride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(
             EMMS"       \n\t"
             SFENCE"     \n\t"
             ::: "memory"
         );
-#endif
 }
 
 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
......
         src += srcStride;
         ydst+= lumStride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(
             EMMS"       \n\t"
             SFENCE"     \n\t"
             ::: "memory"
         );
-#endif
 }
 
 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
......
         udst+= chromStride;
         vdst+= chromStride;
     }
-#if COMPILE_TEMPLATE_MMX
     __asm__(
             EMMS"       \n\t"
             SFENCE"     \n\t"
             ::: "memory"
         );
-#endif
 }
 
 static inline void RENAME(rgb2rgb_init)(void)
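The diff ends at the template's init function. For orientation, a hedged, self-contained sketch of the dispatch pattern this template relies on (names hypothetical; in the real file, RENAME(rgb2rgb_init) aims the public function pointers at that pass's variants):

    #include <stdint.h>

    typedef void (*conv_fn)(const uint8_t *src, uint8_t *dst, long src_size);

    /* stand-in for a template-generated _MMX variant */
    static void rgb24to32_MMX(const uint8_t *src, uint8_t *dst, long src_size)
    {
        const uint8_t *end = src + src_size;
        while (src < end) {
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = 255;
        }
    }

    /* public entry point, aimed at one ISA variant at init time */
    static conv_fn rgb24to32_ptr;

    static void rgb2rgb_init_MMX(void)
    {
        rgb24to32_ptr = rgb24to32_MMX;  /* one assignment per converter */
    }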
