;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_A1

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)
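; Illustrative note (not in the original source): with the 4-argument form,
; a call such as PASS8ROWS(r6, r5, r10, r11), where r5 = r6 + 3*r10 and
; r11 = 3*r10, expands to the eight row addresses
;   [r6], [r6+r10], [r6+r10*2], [r5], [r5+r10], [r5+r10*2], [r5+r11], [r5+r10*4]
; i.e. [r6+0*r10] ... [r6+7*r10]; the 5-argument form just adds a byte offset
; to both base pointers before expanding.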

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro
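; Illustrative note (not in the original source): the three unpack stages build
; the transpose incrementally.  For the byte variant (%1..%3 = bw, wd, dq),
; punpcklbw interleaves pairs of input rows byte-wise, punpcklwd/punpckhwd then
; interleave those results word-wise, and the final dq stage gathers one output
; row per register, so m0..m3 end up holding columns 0..3 of the 8x4 input.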

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro TRANSPOSE4x8W_LOAD 8
%if mmsize==16
    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
%else
    SWAP  1, 4, 2, 3
    mova  m0, [t5]
    mova  m1, [t5+r1]
    mova  m2, [t5+r1*2]
    mova  m3, [t5+t6]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
%endmacro

%macro TRANSPOSE8x2W_STORE 8
    punpckhwd  m0, m1, m2
    punpcklwd  m1, m2
%if mmsize==8
    movd       %3, m0
    movd       %1, m1
    psrlq      m1, 32
    psrlq      m0, 32
    movd       %2, m1
    movd       %4, m0
%else
    movd       %5, m0
    movd       %1, m1
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %2, m1
    movd       %6, m0
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %3, m1
    movd       %7, m0
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %4, m1
    movd       %8, m0
%endif
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro
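; Illustrative note (not in the original source): a scalar sketch of DIFF_GT.
; psubusb saturates at zero, so one of the two differences is always 0 and
; OR-ing them yields the absolute difference; the final saturating subtract
; leaves %4 non-zero (per byte) exactly where |%1 - %2| > %3:
;   d  = (a > b) ? a - b : b - a;   // |a - b| via two saturating subtractions
;   %4 = (d > t) ? d - t : 0;       // non-zero  <=>  |a - b| > t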

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
%ifdef ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

    
258
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
259
; out: m5=beta-1, m7=mask, %3=alpha-1
260
; clobbers: m4,m6
261
%macro LOAD_MASK 2-3
262
    movd     m4, %1
263
    movd     m5, %2
264
    SPLATW   m4, m4
265
    SPLATW   m5, m5
266
    packuswb m4, m4  ; 16x alpha-1
267
    packuswb m5, m5  ; 16x beta-1
268
%if %0>2
269
    mova     %3, m4
270
%endif
271
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
272
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
273
    por      m7, m4
274
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
275
    por      m7, m4
276
    pxor     m6, m6
277
    pcmpeqb  m7, m6
278
%endmacro
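; Illustrative note (not in the original source): after the three DIFF_GT
; results are OR-ed and compared against zero, m7 is a per-pixel byte mask
; that is 0xFF exactly where the usual edge-filtering condition holds:
;   filter = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta;
; (alpha-1 and beta-1 are passed in so the "<" tests can be done as unsigned
; "greater than x-1" tests.)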

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pxor    m5, m1, m2   ; p0^q0
    pand    m5, [pb_1]   ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4       ; d+128+33
    mova    m6, [pb_A1]
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
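; Illustrative note (not in the original source): a scalar reference for the
; p0/q0 update this sequence implements, assuming the standard H.264 bS<4
; filter (clip/clip_uint8 are hypothetical helper names; tc is the per-pixel
; value carried in m7, already masked):
;   delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
;   p0'   = clip_uint8(p0 + delta);
;   q0'   = clip_uint8(q0 - delta);
; Working on 128-biased bytes with pavgb/psubusb avoids widening the 8-bit
; pixels to 16 bits.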

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro
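; Illustrative note (not in the original source): this is the p1/q1 update
; applied only where the extra |p2-p0| < beta (resp. |q2-q0| < beta) test
; passes; written for the p side it is
;   p1' = p1 + clip(((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0, tc0);
; which is the same clip( (p2+((p0+q0+1)>>1))>>1, p1-tc0, p1+tc0 ) form given
; in the comment above, just centred on p1.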

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 1
cglobal deblock_v_luma_8_%1, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal deblock_h_luma_8_%1, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
%ifdef WIN64
    sub    rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub    rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8_%1

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)

    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
%else
    add    rsp, 0x68
%endif
    RET
%endmacro

INIT_XMM
DEBLOCK_LUMA sse2
INIT_AVX
DEBLOCK_LUMA avx

%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%2_luma_8_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal deblock_h_luma_8_%1, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%2_luma_8_%1
%ifidn %2, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%2_luma_8_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16
INIT_AVX
DEBLOCK_LUMA avx, v, 16

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%ifdef ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%ifdef ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%ifdef ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
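; Illustrative note (not in the original source): LUMA_INTRA_P012 computes the
; strong (bS=4) intra filter for one side of the edge.  Per pixel it writes
;   p0'b = (2*p1 + p0 + q1 + 2) >> 2
; unless the extra condition captured in mask1p (roughly |p2-p0| < beta and
; |p0-q0| < (alpha>>2)+2, on top of the basic mask) holds, in which case the
; stronger results p0'a, p1' and p2' shown in the comments above are selected
; instead; mask0 (the basic mask from LOAD_MASK) gates the whole update, and
; the q side reuses the same macro after LUMA_INTRA_SWAP_PQ below.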

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%2_luma_intra_8_%1, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8_%1, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra_8_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
cglobal deblock_h_luma_intra_8_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%2_luma_intra_8_%1
%ifidn %2, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%2_luma_intra_8_%1
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
INIT_AVX
DEBLOCK_LUMA_INTRA avx , v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif

INIT_MMX

%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8_mmxext, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8_mmxext, 5,7
%ifdef ARCH_X86_64
    %define buf0 [rsp-24]
    %define buf1 [rsp-16]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    call ff_chroma_inter_body_mmxext
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
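; Illustrative note (not in the original source): subtracting (p0^q1)&1 before
; the second pavgb cancels the rounding bias of the first average, so the two
; rounded averages reproduce (p0 + q1 + 2*p1 + 2) >> 2 exactly:
;   t   = (p0 + q1) >> 1;         // truncating average of p0 and q1
;   p0' = (t + p1 + 1) >> 1;      // equals (p0 + q1 + 2*p1 + 2) >> 2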

%define t5 r4
%define t6 r5

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8_mmxext, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8_mmxext, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret