## ffmpeg / libavcodec / x86 / vp6dsp_sse2.c @ b10fa1bb

History | View | Annotate | Download (4.43 KB)

1 | ecb24904 | Zuxy Meng | ```
/**
``` |
---|---|---|---|

2 | ```
* @file libavcodec/x86/vp6dsp_sse2.c
``` |
||

3 | ```
* SSE2-optimized functions for the VP6 decoder
``` |
||

4 | ```
*
``` |
||

5 | ```
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
``` |
||

6 | ```
*
``` |
||

7 | ```
* This file is part of FFmpeg.
``` |
||

8 | ```
*
``` |
||

9 | ```
* FFmpeg is free software; you can redistribute it and/or
``` |
||

10 | ```
* modify it under the terms of the GNU Lesser General Public
``` |
||

11 | ```
* License as published by the Free Software Foundation; either
``` |
||

12 | ```
* version 2.1 of the License, or (at your option) any later version.
``` |
||

13 | ```
*
``` |
||

14 | ```
* FFmpeg is distributed in the hope that it will be useful,
``` |
||

15 | ```
* but WITHOUT ANY WARRANTY; without even the implied warranty of
``` |
||

16 | ```
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
``` |
||

17 | ```
* Lesser General Public License for more details.
``` |
||

18 | ```
*
``` |
||

19 | ```
* You should have received a copy of the GNU Lesser General Public
``` |
||

20 | ```
* License along with FFmpeg; if not, write to the Free Software
``` |
||

21 | ```
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
``` |
||

22 | ```
*/
``` |
||

23 | |||

24 | #include "libavutil/x86_cpu.h" |
||

25 | #include "libavcodec/dsputil.h" |
||

26 | #include "dsputil_mmx.h" |
||

27 | #include "vp6dsp_sse2.h" |
||

28 | |||

/*
 * Emit the asm for one 8-pixel row of a 4-tap filter.
 *
 * in1..in4 are byte displacements applied to the source pointer (%0); eight
 * bytes are loaded from each displacement and widened to 16-bit words by
 * interleaving with xmm7, which the enclosing asm statement must have zeroed.
 * The four taps are multiplied by the per-tap weights the caller has
 * broadcast into xmm4, xmm5, xmm6 and xmm3 (in that order), summed, biased
 * with ff_pw_64, shifted right arithmetically by 7, packed back to unsigned
 * bytes with saturation, and the resulting eight bytes are stored at (%1).
 *
 * Uses xmm0-xmm2 as scratch. Neither %0 nor %1 is advanced here.
 */
#define DIAG4_SSE2(in1,in2,in3,in4) \
    "movq "#in1"(%0), %%xmm0 \n\t" \
    "movq "#in2"(%0), %%xmm1 \n\t" \
    "punpcklbw %%xmm7, %%xmm0 \n\t" \
    "punpcklbw %%xmm7, %%xmm1 \n\t" \
    "pmullw %%xmm4, %%xmm0 \n\t" /* tap at in1 * weight[0] */ \
    "pmullw %%xmm5, %%xmm1 \n\t" /* tap at in2 * weight[1] */ \
    "paddw %%xmm1, %%xmm0 \n\t" \
    "movq "#in3"(%0), %%xmm1 \n\t" \
    "movq "#in4"(%0), %%xmm2 \n\t" \
    "punpcklbw %%xmm7, %%xmm1 \n\t" \
    "punpcklbw %%xmm7, %%xmm2 \n\t" \
    "pmullw %%xmm6, %%xmm1 \n\t" /* tap at in3 * weight[2] */ \
    "pmullw %%xmm3, %%xmm2 \n\t" /* tap at in4 * weight[3] */ \
    "paddw %%xmm2, %%xmm1 \n\t" \
    "paddsw %%xmm1, %%xmm0 \n\t" /* saturating sum of both halves */ \
    "paddsw "MANGLE(ff_pw_64)", %%xmm0 \n\t" /* Add 64 (rounding bias for >> 7) */ \
    "psraw $7, %%xmm0 \n\t" \
    "packuswb %%xmm0, %%xmm0 \n\t" /* clamp to 0..255 */ \
    "movq %%xmm0, (%1) \n\t" \

49 | |||

50 | void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, |
||

51 | const int16_t *h_weights,const int16_t *v_weights) |
||

52 | { |
||

53 | uint8_t tmp[8*11], *t = tmp; |
||

54 | src -= stride; |
||

55 | |||

56 | ```
__asm__ volatile(
``` |
||

57 | ```
"pxor %%xmm7, %%xmm7 \n\t"
``` |
||

58 | ```
"movq %4, %%xmm3 \n\t"
``` |
||

59 | ```
"pshuflw $0, %%xmm3, %%xmm4 \n\t"
``` |
||

60 | ```
"punpcklqdq %%xmm4, %%xmm4 \n\t"
``` |
||

61 | ```
"pshuflw $85, %%xmm3, %%xmm5 \n\t"
``` |
||

62 | ```
"punpcklqdq %%xmm5, %%xmm5 \n\t"
``` |
||

63 | ```
"pshuflw $170, %%xmm3, %%xmm6 \n\t"
``` |
||

64 | ```
"punpcklqdq %%xmm6, %%xmm6 \n\t"
``` |
||

65 | ```
"pshuflw $255, %%xmm3, %%xmm3 \n\t"
``` |
||

66 | ```
"punpcklqdq %%xmm3, %%xmm3 \n\t"
``` |
||

67 | ```
"1: \n\t"
``` |
||

68 | DIAG4_SSE2(-1,0,1,2) |
||

69 | ```
"add $8, %1 \n\t"
``` |
||

70 | ```
"add %2, %0 \n\t"
``` |
||

71 | ```
"decl %3 \n\t"
``` |
||

72 | ```
"jnz 1b \n\t"
``` |
||

73 | : "+r"(src), "+r"(t) |
||

74 | : "g"((x86_reg)stride), "r"(11), "m"(*(const int64_t*)h_weights) |
||

75 | ```
: "memory");
``` |
||

76 | |||

77 | ```
t = tmp + 8;
``` |
||

78 | |||

79 | ```
__asm__ volatile(
``` |
||

80 | ```
"movq %4, %%xmm3 \n\t"
``` |
||

81 | ```
"pshuflw $0, %%xmm3, %%xmm4 \n\t"
``` |
||

82 | ```
"punpcklqdq %%xmm4, %%xmm4 \n\t"
``` |
||

83 | ```
"pshuflw $85, %%xmm3, %%xmm5 \n\t"
``` |
||

84 | ```
"punpcklqdq %%xmm5, %%xmm5 \n\t"
``` |
||

85 | ```
"pshuflw $170, %%xmm3, %%xmm6 \n\t"
``` |
||

86 | ```
"punpcklqdq %%xmm6, %%xmm6 \n\t"
``` |
||

87 | ```
"pshuflw $255, %%xmm3, %%xmm3 \n\t"
``` |
||

88 | ```
"punpcklqdq %%xmm3, %%xmm3 \n\t"
``` |
||

89 | ```
"1: \n\t"
``` |
||

90 | DIAG4_SSE2(-8,0,8,16) |
||

91 | ```
"add $8, %0 \n\t"
``` |
||

92 | ```
"add %2, %1 \n\t"
``` |
||

93 | ```
"decl %3 \n\t"
``` |
||

94 | ```
"jnz 1b \n\t"
``` |
||

95 | : "+r"(t), "+r"(dst) |
||

96 | : "g"((x86_reg)stride), "r"(8), "m"(*(const int64_t*)v_weights) |
||

97 | ```
: "memory");
``` |
||

98 | } |