;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_00: times 16 db 0x00
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
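; e.g. call sites pass base3 = base+3*stride and stride3 = 3*stride, so this
; expands to the eight row addresses [base+0*stride] .. [base+7*stride]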

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
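; (a standard interleave transpose: punpcklbw merges byte pairs from adjacent
; rows, punpckl/hwd merges word pairs, punpckl/hdq assembles the final qwords)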
%macro TRANSPOSE4x8_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2
    punpcklbw  m1, m3
    movq       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6
    punpcklbw  m5, m7
    movq       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m6, m5

    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4
    punpckhdq  m1, m4
    punpckldq  m2, m6
    punpckhdq  m3, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro
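
; SBUTTERFLY size, a, b, tmp: interleave registers a and b at 'size'
; granularity (bw/wd/dq); a receives the low halves, tmp the high halves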
%macro SBUTTERFLY 4
    movq       %4, %2
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    movq  [%9+0x10], m1
    SBUTTERFLY bw, m6, %8, m5
    SBUTTERFLY wd, m0, m2, m1
    SBUTTERFLY wd, m4, m6, m2
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY wd, m7, [%9+0x10], m6
    SBUTTERFLY wd, m3, m5, m4
    SBUTTERFLY dq, m7, m3, m0
    SBUTTERFLY dq, m1, m2, m5
    punpckldq m6, m4
    movq  [%9+0x10], m1
    movq  [%9+0x20], m5
    movq  [%9+0x30], m7
    movq  [%9+0x40], m0
    movq  [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    SBUTTERFLY bw, m6, %8, m5
    movq  %9,  m3
    SBUTTERFLY wd, m0, m2, m3
    SBUTTERFLY wd, m4, m6, m2
    SBUTTERFLY wd, m7, m1, m6
    movq  %11, m2
    movq  m2,  %9
    SBUTTERFLY wd, m2, m5, m1
    SBUTTERFLY dq, m0, m4, m5
    SBUTTERFLY dq, m7, m2, m4
    movq  %9,  m0
    movq  %10, m5
    movq  %13, m7
    movq  %14, m4
    SBUTTERFLY dq, m3, %11, m0
    SBUTTERFLY dq, m6, m1, m5
    movq  %11, m3
    movq  %12, m0
    movq  %15, m6
    movq  %16, m5
%endmacro

; out: %4 = nonzero bytes where |%1-%2| > %3
; clobbers: %5
%macro DIFF_GT 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1 ; max(%2-%1, 0)
    psubusb %4, %2 ; max(%1-%2, 0)
    por     %4, %5 ; |%1-%2|
    psubusb %4, %3 ; nonzero where |%1-%2| > %3
%endmacro

; out: %4 = 0xff where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
; clobbers: %5
%macro DIFF_GT2 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1 ; max(%2-%1, 0)
    psubusb %4, %2 ; max(%1-%2, 0)
    psubusb %5, %3
    psubusb %4, %3 ; both become zero iff |%1-%2| <= %3
    pcmpeqb %4, %5 ; 0xff where |%1-%2| <= %3
%endmacro
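
; broadcast the low word of %1 to the whole register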
%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw  %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw   %1, %1, 0
%endif
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1 (if given)
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4
    SPLATW   m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6 ; m7 = 0xff where |p0-q0|<alpha, |p1-p0|<beta and |q1-q0|<beta
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
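; computes the H.264 normal-filter delta
;   d = clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc)
; entirely in bytes: intermediates stay biased (the "d+128+33" below), and
; pb_a1 (0xa1 = 128+33) strips the bias into separate positive and negative
; parts of d, each clamped to tc, so p0 += d and q0 -= d saturate safely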
%macro DEBLOCK_P0_Q0 0
    mova    m5, m1
    pxor    m5, m2           ; p0^q0
    pand    m5, [pb_01 GLOBAL] ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0           ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2           ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4           ; d+128+33
    mova    m6, [pb_a1 GLOBAL]
    psubusb m6, m3
    psubusb m3, [pb_a1 GLOBAL]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    mova    %6, m1
    pavgb   %6, m2
    pavgb   %2, %6             ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
    psubusb %2, %6             ; (p2+((p0+q0+1)>>1))>>1
    mova    %6, %1
    psubusb %6, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
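    ; args: r0=pix, r1=stride, r2=alpha, r3=beta, r4=tc0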
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8   ; 0xff where tc0 == -1 (edge not filtered)
    pandn   m9, m7   ; mask &= tc0 != -1
    pand    m8, m9   ; zero tc where masked out

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; m6 = 0xff where |p2-p0| < beta
    pand    m6, m9
    mova    m7, m8
    psubb   m7, m6   ; tc++ where p1 is filtered
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; m6 = 0xff where |q2-q0| < beta
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6   ; tc++ where q1 is filtered
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
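; implemented by transposing the 16x6 strip into a temp buffer, reusing the
; vertical filter on it, then transposing the changed middle rows back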
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
%ifdef WIN64
    sub    rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub    rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4
%endif
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
%else
    add    rsp, 0x68
%endif
    RET

%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3   ; 0xff where tc0 >= 0
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; m6 = 0xff where |p2-p0| < beta
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    mova    m7, m4
    psubb   m7, m6   ; tc++ where p1 is filtered
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; m6 = 0xff where |q2-q0| < beta
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6   ; tc++ where q1 is filtered
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8 ; the 8-pixel-wide mmx version needs two passes to cover 16 columns
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH

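; in: %4 = p3 (memory), p2..q1 via the register defines, masks from LOAD_MASK
; out: %1/%2/%3 = new p0/p1/p2; p1/p2 change only where mask1p, p0 gets the
;      strong value where mask1p and (2*p1+p0+q1+2)>>2 where only mask0 is set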
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_00
    pxor  t2, t0
    pand  t2, mpb_01
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4

    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_01
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_00
    pxor  t3, t1
    pand  t3, mpb_01
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova  t3, p0
    mova  t2, p0
    pxor  t3, q1
    pavgb t2, q1
    pand  t3, mpb_01
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0 ; p0'a where mask1p, p0'b where mask0 only, else p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    mova  t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_00
    pxor  t2, t1
    pand  t2, mpb_01
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
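
; relabel the p/q registers so LUMA_INTRA_P012 can filter the q side unchanged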
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_00 m14
    %define mpb_01 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_00 [pb_00 GLOBAL]
    %define mpb_01 [pb_01 GLOBAL]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_00, mpb_00
    mova    mpb_01, [pb_01 GLOBAL]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_00
    pavgb   t5, mpb_01 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = mask: |p0-q0| < alpha/4+2
    DIFF_GT2 p0, p2, m5, t2, t5 ; t2 = mask: |p2-p0| < beta
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = mask: |q2-q0| < beta
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_00 GLOBAL]
    pavgb   m4, [pb_01 GLOBAL] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = mask: |p0-q0| < alpha/4+2
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = mask: |p2-p0| < beta
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = mask: |q2-q0| < beta
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   x264_deblock_%2_luma_intra_%1
%ifidn %2, v8 ; the 8-pixel-wide mmx version needs two passes to cover 16 columns
    add    dword [rsp], 8 ; pix_tmp+0x48
    call   x264_deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif