;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_A1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
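; e.g. PASS8ROWS(p, p+3*stride, stride, 3*stride) names the eight rows
; [p], [p+stride], ..., [p+7*stride], which is the form the transpose macros
; below consume.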

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2
    punpcklbw  m1, m3
    movq       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6
    punpcklbw  m5, m7
    movq       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m6, m5

    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4
    punpckhdq  m1, m4
    punpckldq  m2, m6
    punpckhdq  m3, m6
%endmacro
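; The transpose works in three interleave stages: punpcklbw merges bytes from
; row pairs, punpcklwd/punpckhwd merge the resulting word pairs, and
; punpckldq/punpckhdq produce the final 8-byte rows, so m0..m3 each hold one
; byte column of the original 8x4 block.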

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro

%macro SBUTTERFLY3 4
    movq       %4, %2
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY3 bw, m0, m1, m7
    SBUTTERFLY3 bw, m2, m3, m1
    SBUTTERFLY3 bw, m4, m5, m3
    movq  [%9+0x10], m1
    SBUTTERFLY3 bw, m6, %8, m5
    SBUTTERFLY3 wd, m0, m2, m1
    SBUTTERFLY3 wd, m4, m6, m2
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m7, [%9+0x10], m6
    SBUTTERFLY3 wd, m3, m5, m4
    SBUTTERFLY3 dq, m7, m3, m0
    SBUTTERFLY3 dq, m1, m2, m5
    punpckldq m6, m4
    movq  [%9+0x10], m1
    movq  [%9+0x20], m5
    movq  [%9+0x30], m7
    movq  [%9+0x40], m0
    movq  [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY3 bw, m0, m1, m7
    SBUTTERFLY3 bw, m2, m3, m1
    SBUTTERFLY3 bw, m4, m5, m3
    SBUTTERFLY3 bw, m6, %8, m5
    movq  %9,  m3
    SBUTTERFLY3 wd, m0, m2, m3
    SBUTTERFLY3 wd, m4, m6, m2
    SBUTTERFLY3 wd, m7, m1, m6
    movq  %11, m2
    movq  m2,  %9
    SBUTTERFLY3 wd, m2, m5, m1
    SBUTTERFLY3 dq, m0, m4, m5
    SBUTTERFLY3 dq, m7, m2, m4
    movq  %9,  m0
    movq  %10, m5
    movq  %13, m7
    movq  %14, m4
    SBUTTERFLY3 dq, m3, %11, m0
    SBUTTERFLY3 dq, m6, m1, m5
    movq  %11, m3
    movq  %12, m0
    movq  %15, m6
    movq  %16, m5
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = 0xff byte mask where |%1-%2| <= %3 (i.e. where NOT |%1-%2| > %3)
; clobbers: %5
%macro DIFF_GT2 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
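; Both DIFF_GT and DIFF_GT2 derive |%1-%2| from two saturating subtractions
; (psubusb clamps at zero, so one of the two results is the absolute
; difference and the other is zero).  DIFF_GT leaves a nonzero byte wherever
; the threshold is exceeded, while DIFF_GT2 finishes with pcmpeqb and so
; yields a full 0x00/0xff mask of the opposite polarity, which is how the
; callers below use it.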

%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw  %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw   %1, %1, 0
%endif
%endmacro
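; SPLATW broadcasts the low word of %1 across the whole register; the %ifidn
; test selects the SSE2 sequence (pshuflw + punpcklqdq) when the current
; mmreg type is XMM and plain pshufw when running in MMX mode.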

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4
    SPLATW   m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
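; The three DIFF_GT results are OR'd together and compared against zero, so
; m7 ends up 0xff only where
;   |p0-q0| < alpha  &&  |p1-p0| < beta  &&  |q1-q0| < beta
; i.e. where the H.264 edge condition allows the pixel to be filtered.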

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    mova    m5, m1
    pxor    m5, m2       ; p0^q0
    pand    m5, [pb_1]   ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4       ; d+128+33
    mova    m6, [pb_A1]
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
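; For reference, the standard p0/q0 filter this corresponds to (bS < 4) is
;   delta = clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 )
;   p0'   = clip1(p0 + delta),  q0' = clip1(q0 - delta)
; The code keeps delta biased by 0xA1 (128+33) so it fits in unsigned bytes;
; the two psubusb against pb_A1 split it into its positive and negative
; parts, pminub against m7 clips each part to tc, and the final saturating
; adds/subtracts apply it to p0 and q0.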

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    mova    %6, m1
    pavgb   %6, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    mova    %6, %1
    psubusb %6, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro
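; pavgb rounds up ((a+b+1)>>1), so the pxor/pand [pb_1]/psubusb sequence
; subtracts the stray carry bit and turns the second average into the exact
; floor division required by the formula above; pmaxub/pminub then perform
; the clip to [q1-tc0, q1+tc0].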

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9
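    ; m9 now holds mask & (tc0 != -1): tc0 bytes of 0xff (-1) mark edges that
    ; are not filtered at all, so the two pcmpeqb detect them and the pand
    ; above zeroes tc wherever the filter is skipped.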

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    mova    m7, m8
    psubb   m7, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
%ifdef WIN64
    sub    rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub    rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4
%endif
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
%else
    add    rsp, 0x68
%endif
    RET

%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    mova    m7, m4
    psubb   m7, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH




%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova  t3, p0
    mova  t2, p0
    pxor  t3, q1
    pavgb t2, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    mova  t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
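; In spec terms these are the strong (bS=4) luma filter equations:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
; selected per pixel by mask1p, with the fallback p0' = (2*p1 + p0 + q1 + 2) >> 2
; (computed as p0'b above) used where only mask0 is set.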

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   x264_deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif




INIT_MMX

%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_mmxext, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call x264_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_mmxext, 5,7
%ifdef ARCH_X86_64
    %define buf0 [rsp-24]
    %define buf1 [rsp-16]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    call x264_chroma_inter_body_mmxext
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
x264_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret
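; Chroma deblocking only ever modifies p0 and q0 (one row on each side of the
; edge), so this shared body is just LOAD_MASK + DEBLOCK_P0_Q0; the punpcklbw
; duplicates each tc0 byte because every tc0 value covers two chroma pixels.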



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
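; Same carry-correction idiom as LUMA_Q1: pavgb rounds up, so subtracting
; ((p0^q1)&1) before the second average makes the result exactly
; (p0 + q1 + 2*p1 + 2) >> 2 as stated above.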

%define t5 r4
%define t6 r5

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call x264_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
    call x264_chroma_intra_body_mmxext
    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
x264_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret