ffmpeg / libavcodec / i386 / h264_deblock_sse2.asm @ c4ff7c53

;*****************************************************************************
;* deblock-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************
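; This file provides the SSE2/MMXEXT H.264 in-loop deblocking (loop filter)
; routines for luma edges, imported from x264's deblock-a.asm.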

%include "x86inc.asm"

SECTION_RODATA
pb_00: times 16 db 0x00
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2
    punpcklbw  m1, m3
    movq       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6
    punpcklbw  m5, m7
    movq       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m6, m5

    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4
    punpckhdq  m1, m4
    punpckldq  m2, m6
    punpckhdq  m3, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro

%macro SBUTTERFLY 4
    movq       %4, %2
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    movq  [%9+0x10], m1
    SBUTTERFLY bw, m6, %8, m5
    SBUTTERFLY wd, m0, m2, m1
    SBUTTERFLY wd, m4, m6, m2
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY wd, m7, [%9+0x10], m6
    SBUTTERFLY wd, m3, m5, m4
    SBUTTERFLY dq, m7, m3, m0
    SBUTTERFLY dq, m1, m2, m5
    punpckldq m6, m4
    movq  [%9+0x10], m1
    movq  [%9+0x20], m5
    movq  [%9+0x30], m7
    movq  [%9+0x40], m0
    movq  [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    SBUTTERFLY bw, m6, %8, m5
    movq  %9,  m3
    SBUTTERFLY wd, m0, m2, m3
    SBUTTERFLY wd, m4, m6, m2
    SBUTTERFLY wd, m7, m1, m6
    movq  %11, m2
    movq  m2,  %9
    SBUTTERFLY wd, m2, m5, m1
    SBUTTERFLY dq, m0, m4, m5
    SBUTTERFLY dq, m7, m2, m4
    movq  %9,  m0
    movq  %10, m5
    movq  %13, m7
    movq  %14, m4
    SBUTTERFLY dq, m3, %11, m0
    SBUTTERFLY dq, m6, m1, m5
    movq  %11, m3
    movq  %12, m0
    movq  %15, m6
    movq  %16, m5
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    por     %4, %5
    psubusb %4, %3
%endmacro
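; DIFF_GT builds |%1-%2| out of two unsigned saturating subtractions (one of
; the two results is always zero), then the final psubusb leaves a nonzero
; byte exactly where the absolute difference exceeds the threshold %3.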

; out: %4 = 0xff where |%1-%2| <= %3  (opposite sense to DIFF_GT)
; clobbers: %5
%macro DIFF_GT2 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw  %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw   %1, %1, 0
%endif
%endmacro
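; SPLATW broadcasts the low 16-bit word of %1 to all word lanes of the
; register: pshuflw+punpcklqdq for xmm, a single pshufw for mmx.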

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4
    SPLATW   m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
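; The mask in m7 ends up 0xff exactly where |p0-q0| <= alpha-1, |p1-p0| <= beta-1
; and |q1-q0| <= beta-1 all hold, i.e. where the loop filter is allowed to touch
; the edge; in the tc-based (non-intra) filters it is ANDed into tc, so
; masked-off pixels get a clip range of 0 and stay unchanged.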

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    mova    m5, m1
    pxor    m5, m2           ; p0^q0
    pand    m5, [pb_01 GLOBAL] ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0           ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2           ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4           ; d+128+33
    mova    m6, [pb_a1 GLOBAL]
    psubusb m6, m3
    psubusb m3, [pb_a1 GLOBAL]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
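; The pavgb/psubusb sequence above is essentially a branchless, whole-register
; form of the standard bs<4 update
;     delta = clip( ((q0-p0)<<2 + (p1-q1) + 4) >> 3, -tc, tc )
;     p0' = p0 + delta,  q0' = q0 - delta
; with tc taken from m7; the (p0^q0)&1 term corrects the rounding of the byte
; averages, and the [pb_a1] steps split delta into clamped positive/negative parts.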

; in: m1=p0 m2=q0
;     %1=q1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    mova    %6, m1
    pavgb   %6, m2
    pavgb   %2, %6             ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
    psubusb %2, %6             ; (p2+((p0+q0+1)>>1))>>1
    mova    %6, %1
    psubusb %6, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro
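; The functions below are the actual filters. The *_v_* versions load p2..q2 as
; whole rows around *pix, so one aligned load covers the full width of the edge.
; The *_h_* versions handle the perpendicular edge by transposing a 6- or
; 8-pixel-wide strip into a temporary buffer with a stride of 16 bytes, calling
; the matching *_v_* filter on it (pix_tmp+0x30, or +0x40 in the intra case, is
; the q0 row), and transposing the modified rows back into place.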

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    mova    m7, m8
    psubb   m7, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ret

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2
    movsxd r10, esi
    lea    r11, [r10+r10*2]
    lea    rax, [r0-4]
    lea    r9,  [r0-4+r11]
    sub    rsp, 0x68
    %define pix_tmp rsp

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp
    lea    rax, [rax+r10*8]
    lea    r9,  [r9 +r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    esi, 0x10
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    rax, 2
    add    r9,  2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)

    shl    r10, 3
    sub    rax, r10
    sub    r9,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)

    add    rsp, 0x68
    ret

%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4m
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    mova    m7, m4
    psubb   m7, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov    r0, r0m
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0m
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH

%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_00
    pxor  t2, t0
    pand  t2, mpb_01
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_01
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_00
    pxor  t3, t1
    pand  t3, mpb_01
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova  t3, p0
    mova  t2, p0
    pxor  t3, q1
    pavgb t2, q1
    pand  t3, mpb_01
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    mova  t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_00
    pxor  t2, t1
    pand  t2, mpb_01
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
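; LUMA_INTRA_P012 is the strong (bs=4) intra filter for one side of the edge:
; it computes p0', p1' and p2' as given in the inline comments, then uses mask1p
; and mask0 to pick, per pixel, between the strong results, the simple 3-tap
; p0'b, and the unfiltered input (the pxor/pand/pxor pattern is a bitwise select).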

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_00 m14
    %define mpb_01 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_00 [pb_00 GLOBAL]
    %define mpb_01 [pb_01 GLOBAL]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_00, mpb_00
    mova    mpb_01, [pb_01 GLOBAL]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_00
    pavgb   t5, mpb_01 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_00 GLOBAL]
    pavgb   m4, [pb_01 GLOBAL] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    rax, [r0-4]
    lea    r9,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    rax, [rax+r10*8]
    lea    r9,  [r9+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r9, [rax+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
    shl    r10, 3
    sub    rax, r10
    sub    r9,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
    add    rsp, 0x88
    ret
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   x264_deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0m
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif