/*
   Colour conversion routines (RGB <-> YUV) in x86 assembly, with viewport
   extension.

   (C) 2001 Nemosoft Unv.    nemosoft@smcc.demon.nl

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

/*
   Syntax: GNU as, AT&T operand order (op src, dst), 32-bit x86, run through
   the C preprocessor.

   The vcvt_* functions always start with width and height and plus, so these
   parameters are in 8(%ebp), 12(%ebp) and 16(%ebp). The other parameters can
   be 2 to 4 pointers, in one of these combinations:

     *src, *dst
     *srcy, *srcu, *srcv, *dst
     *src, *dsty, *dstu, *dstv

   Every vcvt_* entry point saves and restores %ebx, %esi and %edi; %eax,
   %ecx and %edx are used as scratch. The Height argument slot (and in some
   routines the Plus slot) on the stack is modified while converting.
   The string instructions (lodsl/stosl/...) assume the direction flag is
   clear on entry.
 */

#define __ASSEMBLY__
/* NOTE(review): the header name was missing from this include as received;
   <linux/linkage.h> is presumed because the file relies on ENTRY().
   Confirm against the build tree. */
#include <linux/linkage.h>

.line 35

#define Width		8(%ebp)
#define Height		12(%ebp)
#define Plus		16(%ebp)

/* 2 parameters, 1 in, 1 out */
#define Src2		20(%ebp)
#define Dst2		24(%ebp)

/* 4 parameters, 3 in, 1 out */
#define SrcY		20(%ebp)
#define SrcU		24(%ebp)
#define SrcV		28(%ebp)
#define Dst4		32(%ebp)

/* 4 parameters, 1 in, 3 out */
#define Src4		20(%ebp)
#define DstY		24(%ebp)
#define DstU		28(%ebp)
#define DstV		32(%ebp)

/*
   This buffer space used to be statically allocated, but that is going to
   give problems with multiple cams (though I have yet to see it).
   Therefore, we reserve at least 64 bytes (16 * 4) on the stack, plus some
   space for extra variables.

   PixelBuffer holds 4 pixels of 4 longs each (R, G, B, alpha), every
   component being the 8 bit value shifted left by 8 (8 bits of fraction).
 */
#define PixelBuffer	-64(%ebp)
#define Uptr		-68(%ebp)
#define Vptr		-72(%ebp)
#define DstPlus		-76(%ebp)
#define StackSpace	$76

.text

/*
   These functions load the source and destination pointers, including
   Uptr/Vptr when necessary, and test the width/height parameters.

     - %esi will be set to Src or SrcY
     - %edi will be set to Dst or DstY

   The carry flag will be set if any of these tests fail, and is cleared on
   success. They assume %ebp has been set up by the caller (stack frame with
   at least StackSpace bytes of locals).
 */

/* 2 parameters, src & dst */
test_param_2:
	mov	Src2, %esi
	mov	Dst2, %edi
	test	%esi, %esi		# NULL pointers?
	jz	param_fail
	test	%edi, %edi
	jz	param_fail
	jmp	test_width_height

/* 3 inputs, 1 output: %esi = SrcY, %edi = Dst4, Uptr/Vptr = SrcU/SrcV */
test_param_31:
	mov	Dst4, %edi		# NULL pointers?
	test	%edi, %edi
	jz	param_fail
	mov	SrcV, %esi
	test	%esi, %esi
	jz	param_fail
	mov	%esi, Vptr
	mov	SrcU, %esi
	test	%esi, %esi
	jz	param_fail
	mov	%esi, Uptr
	mov	SrcY, %esi
	test	%esi, %esi
	jz	param_fail
	jmp	test_width_height

/* 1 input, 3 outputs: %esi = Src4, %edi = DstY, Uptr/Vptr = DstU/DstV */
test_param_13:
	mov	Src4, %esi		# NULL pointers?
	test	%esi, %esi
	jz	param_fail
	mov	DstV, %edi
	test	%edi, %edi
	jz	param_fail
	mov	%edi, Vptr
	mov	DstU, %edi
	test	%edi, %edi
	jz	param_fail
	mov	%edi, Uptr
	mov	DstY, %edi
	test	%edi, %edi
	jz	param_fail
	jmp	test_width_height

/*
   Common tail of the test_param_* functions.
   Width must be > 0 and a multiple of 4 (the pixel helpers work on groups
   of 4 and count %ecx down in steps of 4); Height must be > 0 and even
   (the 4:2:0 formats pair an U line with a V line).
   The comparisons are signed so that negative values are rejected as well.
 */
test_width_height:
	cmpl	$0, Width
	jle	param_fail		# zero or negative width
	testl	$3, Width		# multiple of 4?
	jnz	param_fail		# Nope...
	cmpl	$0, Height		# check illegal height
	jle	param_fail
	testl	$1, Height		# Odd no. of lines?
	jnz	param_fail		# Aye
	/* fall through */

/* exit points */
param_ok:
	clc				# Success: clear carry
	ret

param_fail:
	stc				# Fail: set carry
	ret


/*
   This will fill PixelBuffer with 4 grey scale pixels (Y).
   R, G and B of each pixel are set to Y << 8; the alpha slot is set to 0 so
   that no uninitialised stack contents can reach the output.

   In:       %eax = Value (Y3Y2Y1Y0)
             %ecx = pixels left on this line, must be a multiple of 4
   Out:      PixelBuffer
   Modifies: %ecx (-4)
   Destroys: %eax, %edx, %edi
 */
expand_4_y:
	mov	%eax, %edx		# Keep in edx (we need eax)
	lea	PixelBuffer, %edi
0:	/* This code is executed 4 times: %ecx starts as a multiple of 4 and
	   the loop ends when its low two bits are zero again */
	movzbl	%dl, %eax		# move, zero extending byte-to-long
	shl	$8, %eax		# 8 digit precision
	stosl				# Expand into PixelBuffer (R)
	stosl				# (G)
	stosl				# (B)
	xor	%eax, %eax
	stosl				# alpha = 0
	shr	$8, %edx		# next Y
	dec	%ecx
	test	$3, %ecx
	jnz	0b
	ret				# from expand_4_y


/*
   This will add the colour factors to the (grey) values in PixelBuffer.
   V0/U0 are applied to pixels 0 and 1, V1/U1 to pixels 2 and 3.
   The factors are scaled by 256:
     R += 359 * (V - 128)
     G -= 183 * (V - 128) + 88 * (U - 128)
     B += 454 * (U - 128)
   mul is an unsigned multiply, but only the low 32 bits of the product are
   used and those are the same for a signed multiply.

   In:       %ebx (U1U0V1V0)
   Out:      PixelBuffer
   Destroys: %edi, %ebx, %eax, %edx
 */
expand_4_uv:
	lea	PixelBuffer, %edi	# reset pointer

	/* V0 */
	sub	$128, %bl
	movsbl	%bl, %eax
	mov	$359, %edx		# Vr
	mul	%edx
	add	%eax, 0x00(%edi)
	add	%eax, 0x10(%edi)

	movsbl	%bl, %eax
	mov	$183, %edx		# Vg
	mul	%edx
	sub	%eax, 0x04(%edi)
	sub	%eax, 0x14(%edi)

	/* V1 */
	sub	$128, %bh
	movsbl	%bh, %eax
	mov	$359, %edx		# Vr
	mul	%edx
	add	%eax, 0x20(%edi)
	add	%eax, 0x30(%edi)

	movsbl	%bh, %eax
	mov	$183, %edx		# Vg
	mul	%edx
	sub	%eax, 0x24(%edi)
	sub	%eax, 0x34(%edi)

	/* U0; after the byte swap %bh = U0 and %bl = U1 */
	bswap	%ebx			# Get U values in lower half
	sub	$128, %bh
	movsbl	%bh, %eax
	mov	$88, %edx		# Ug
	mul	%edx
	sub	%eax, 0x04(%edi)
	sub	%eax, 0x14(%edi)

	movsbl	%bh, %eax
	mov	$454, %edx		# Ub
	mul	%edx
	add	%eax, 0x08(%edi)
	add	%eax, 0x18(%edi)

	/* U1 */
	sub	$128, %bl
	movsbl	%bl, %eax
	mov	$88, %edx		# Ug
	mul	%edx
	sub	%eax, 0x24(%edi)
	sub	%eax, 0x34(%edi)

	movsbl	%bl, %eax
	mov	$454, %edx		# Ub
	mul	%edx
	add	%eax, 0x28(%edi)
	add	%eax, 0x38(%edi)
	ret				# expand_4_uv


/*
   This function expands 4 420i pixels into PixelBuffer.

   In:       %esi = source pointer (4 Y bytes followed by 2 U or V bytes)
             %ecx = pixels left on this line
             Width, Height (parity of Height selects U line / V line)
   Out:      PixelBuffer; %esi advanced by 6; %ecx decreased by 4
   Destroys: %eax, %ebx, %edx (%edi is preserved)
 */
do_four_yuvi:
	push	%edi
	lodsl				# 4 bytes at a time
	call	expand_4_y

	/* now do UV values. on even lines, Y is followed by U values; on
	   odd lines V values follow. The U and V values are always pushed
	   on the stack in this order: U V, so that the 32 bit pop below
	   yields U1U0V1V0 */

	/* First, calculate offset per line (1.5 * width) */
	mov	Width, %ebx		# width
	shl	%ebx			# 2 *
	add	Width, %ebx		# 3 *
	shr	%ebx			# 1.5 *

	/* even or odd lines */
	testl	$1, Height
	jz	2f

	/* odd line; we are at V data, but do U data first */
	neg	%ebx			# make ebx offset negative
	mov	(%esi,%ebx), %ax	# U (same position, one line back)
	push	%ax
	lodsw				# V
	push	%ax
	jmp	3f

2:	/* even line */
	lodsw				# U
	push	%ax
	sub	$2, %ebx		# lodsw already moved esi 2 bytes
	mov	(%esi,%ebx), %ax	# V (same position, next line)
	push	%ax

3:	/* Okay, so we now have the U and V values... expand into PixelBuffer */
	pop	%ebx			# both 16 bit pushes at once
	call	expand_4_uv
	pop	%edi
	ret				# from do_four_yuvi


/*
   Do four pixels, in planar format.

   In:       %esi = Y source pointer, Uptr/Vptr = U and V source pointers
             %ecx = pixels left on this line
   Out:      PixelBuffer; %esi advanced by 4, Uptr and Vptr by 2;
             %ecx decreased by 4
   Destroys: %eax, %ebx, %edx (%edi is preserved)
 */
do_four_yuvp:
	push	%edi

	/* The first part is the same as for interlaced (4 bytes Y) */
	lodsl				# 4 bytes at a time
	call	expand_4_y

	/* now gather U and V values... */
	mov	Uptr, %ebx		# Use Uptr/Vptr
	mov	(%ebx), %ax
	push	%ax
	add	$2, %ebx
	mov	%ebx, Uptr

	mov	Vptr, %ebx
	mov	(%ebx), %ax
	push	%ax
	add	$2, %ebx
	mov	%ebx, Vptr

	pop	%ebx			# U1U0V1V0
	call	expand_4_uv
	pop	%edi
	ret


/*
   Do four pixels, in yuyv interlaced format.
   The register comments show the bytes from most to least significant.

   In:       %esi = source pointer (8 bytes: Y0 U0 Y1 V0 Y2 U1 Y3 V1)
             %ecx = pixels left on this line
   Out:      PixelBuffer; %esi advanced by 8; %ecx decreased by 4
   Destroys: %eax, %ebx, %edx (%edi is preserved)
 */
do_four_yuyv:
	push	%edi

	lodsl				# v0y1u0y0
	mov	%eax, %ebx
	bswap	%ebx			# y0u0y1v0
	mov	%bh, %ah		# v0y1y1y0
	and	$0x00ff00ff, %ebx	# __u0__v0
	push	%ax			# y1y0

	lodsl				# v1y3u1y2	# mix register instructions
	mov	%eax, %edx		#		# so CPU pipeline doesnt stall
	rol	$16, %eax		# u1y2v1y3
	mov	%dl, %dh		# v1y3y2y2
	and	$0xff00ff00, %eax	# u1__v1__
	mov	$0, %dl			# v1y3y2__
	or	%eax, %ebx		# u1u0v1v0
	shl	$8, %edx		# y3y2____
	pop	%dx			# y3y2y1y0
	mov	%edx, %eax
	call	expand_4_y		# leaves ebx alone
	call	expand_4_uv

	pop	%edi
	ret


/*
   Limit all values in PixelBuffer to the range 0 .. 0xff00 (that is, 0 to
   255 with 8 bits of fraction). All 16 longs are processed, alpha included.

   In:       PixelBuffer
   Out:      PixelBuffer
   Destroys: %eax (%esi, %edi and %ecx are preserved)
 */
limit_pixels:
	push	%esi
	push	%edi
	push	%ecx
	lea	PixelBuffer, %esi
	mov	%esi, %edi
	mov	$16, %ecx
0:	lodsl
	cmp	$0, %eax		# this would have been a perfect spot for CMOVxx instructions...
	jl	2f			#  except they only work on Pentium Pro processors,
	cmp	$0xff00, %eax		#  and not even all of them
	jg	3f
	add	$4, %edi		# value is fine; no use for stosl here
	loop	0b
	jmp	9f
2:	mov	$0, %eax		# negative: clamp to 0
	stosl
	loop	0b
	jmp	9f
3:	mov	$0xff00, %eax		# too large: clamp to maximum
	stosl
	loop	0b
9:	pop	%ecx
	pop	%edi
	pop	%esi
	ret				# from limit_pixels


/*
   Copy RGB values from PixelBuffer into the destination buffer.
   All push_* functions take %edi = destination pointer and advance it past
   the 4 pixels written. They destroy %eax and preserve %ecx and %esi.
 */

/* Push 4 pixels (12 bytes), in correct order */
push_rgb24:
	push	%ecx
	push	%esi
	lea	PixelBuffer, %esi
	mov	$4, %ecx
0:	lodsl
	shr	$8, %eax
	mov	%al, (%edi)		# Red
	lodsl
	shr	$8, %eax
	mov	%al, 1(%edi)		# Green
	lodsl
	shr	$8, %eax
	mov	%al, 2(%edi)		# Blue
	add	$3, %edi
	lodsl				# dummy (skip alpha)
	loop	0b
	pop	%esi
	pop	%ecx
	ret

/* Push 4 pixels (12 bytes), in wrong order */
push_bgr24:
	push	%ecx
	push	%esi
	lea	PixelBuffer, %esi
	mov	$4, %ecx
0:	lodsl
	shr	$8, %eax
	mov	%al, 2(%edi)		# Red
	lodsl
	shr	$8, %eax
	mov	%al, 1(%edi)		# Green
	lodsl
	shr	$8, %eax
	mov	%al, (%edi)		# Blue
	add	$3, %edi
	lodsl				# dummy (skip alpha)
	loop	0b
	pop	%esi
	pop	%ecx
	ret

/* The simplest format: push 4 pixels of 4 bytes each, RGBa.
   All 16 PixelBuffer entries are copied, so the alpha byte that is written
   is whatever the alpha slot of PixelBuffer holds. */
push_rgb32:
	push	%ecx
	push	%esi
	mov	$16, %ecx
	lea	PixelBuffer, %esi
0:	lodsl				# red, green, blue, alpha in turn
	shr	$8, %eax		# 8 bit precision
	stosb
	loop	0b
	pop	%esi
	pop	%ecx
	ret


/* Gosh. Would you believe it. They even made this format... (Qt 2.*)
   4 pixels of 4 bytes each, BGRa. Unlike push_rgb32 the alpha byte of the
   destination is skipped, not written. */
push_bgr32:
	/* copy the colour values to the output buffer */
	push	%ecx
	push	%esi
	mov	$4, %ecx
	lea	PixelBuffer, %esi
0:	lodsl				# red
	shr	$8, %eax		# 8 bit precision
	mov	%al, 2(%edi)
	lodsl				# green
	shr	$8, %eax
	mov	%al, 1(%edi)
	lodsl				# blue
	shr	$8, %eax
	mov	%al, (%edi)
	add	$4, %edi
	lodsl				# dummy (skip alpha)
	loop	0b
	pop	%esi
	pop	%ecx
	ret


/*************************************/

/* Functions to go from YUV interlaced formats to RGB.

   Stack arguments: width, height, plus, src, dst (see the defines at the
   top). The source is read contiguously; after every line the destination
   pointer is set to the start of that line plus Plus pixels (Plus times the
   number of bytes per pixel). Nothing is converted when the parameter
   checks fail.
   NOTE(review): presumably declared in C as
   void f(int width, int height, int plus, void *src, void *dst) - confirm
   against the C header. */

/* Go from interlaced to RGB, red first */
ENTRY(vcvt_420i_rgb24)
	enter	StackSpace, $0		# room for PixelBuffer etc., no nested frames
	push	%ebx
	push	%esi
	push	%edi

	call	test_param_2
	jc	9f

	mov	Plus, %eax		# 3 bytes per pixel
	shl	$1, %eax
	add	Plus, %eax
	mov	%eax, DstPlus

0:	mov	Width, %ecx		# width
	push	%edi			# Save dst pointer
1:	call	do_four_yuvi		# counts ecx down by 4
	call	limit_pixels
	call	push_rgb24

	cmp	$0, %ecx
	jnz	1b			# end of line?
	pop	%edi			# Get dst pointer
	add	DstPlus, %edi		# Add offset
	decl	Height			# yes; decrement line counter
	jnz	0b

9:	pop	%edi
	pop	%esi
	pop	%ebx
	leave
	ret

/* Go from interlaced to BGR, blue first */
ENTRY(vcvt_420i_bgr24)
	enter	StackSpace, $0		# room for PixelBuffer etc., no nested frames
	push	%ebx
	push	%esi
	push	%edi

	call	test_param_2
	jc	9f

	mov	Plus, %eax		# 3 bytes per pixel
	shl	$1, %eax
	add	Plus, %eax
	mov	%eax, DstPlus

0:	mov	Width, %ecx		# width
	push	%edi			# Save dst pointer
1:	call	do_four_yuvi		# counts ecx down by 4
	call	limit_pixels
	call	push_bgr24

	cmp	$0, %ecx
	jnz	1b			# end of line?
	pop	%edi			# Get dst pointer
	add	DstPlus, %edi		# Add offset
	decl	Height			# yes; decrement line counter
	jnz	0b

9:	pop	%edi
	pop	%esi
	pop	%ebx
	leave
	ret

/* From interlaced to RGBa */
ENTRY(vcvt_420i_rgb32)
	enter	StackSpace, $0		# room for PixelBuffer etc., no nested frames
	push	%ebx
	push	%esi
	push	%edi

	call	test_param_2
	jc	9f

	mov	Plus, %eax		# 4 bytes per pixel
	shl	$2, %eax
	mov	%eax, DstPlus

0:	mov	Width, %ecx		# width
	push	%edi			# Save dst pointer
1:	call	do_four_yuvi		# counts ecx down by 4
	call	limit_pixels
	call	push_rgb32

	cmp	$0, %ecx		# end of line?
	jnz	1b
	pop	%edi			# Get dst pointer
	add	DstPlus, %edi		# Add offset
	decl	Height			# yes; decrement line counter
	jnz	0b

9:	pop	%edi
	pop	%esi
	pop	%ebx
	leave
	ret

/* Guess what? Go from interlaced to BGRa */
ENTRY(vcvt_420i_bgr32)
	enter	StackSpace, $0		# room for PixelBuffer etc., no nested frames
	push	%ebx
	push	%esi
	push	%edi

	call	test_param_2
	jc	9f

	mov	Plus, %eax		# 4 bytes per pixel
	shl	$2, %eax
	mov	%eax, DstPlus

0:	mov	Width, %ecx		# width
	push	%edi			# Save dst pointer
1:	call	do_four_yuvi		# counts ecx down by 4
	call	limit_pixels
	call	push_bgr32

	cmp	$0, %ecx		# end of line?
	jnz	1b
	pop	%edi			# Get dst pointer
	add	DstPlus, %edi		# Add offset
	decl	Height			# yes; decrement line counter
	jnz	0b

9:	pop	%edi
	pop	%esi
	pop	%ebx
	leave
	ret


/**************************************************************************/

/* Go from 'interlaced' (YYYY UU/VV) format to planar.

   Stack arguments: width, height, plus, src, dsty, dstu, dstv.
   The Y plane is written with a line stride of Plus bytes, the U and V
   planes with a stride of Plus / 2 bytes (the Plus argument slot is halved
   in place). Nothing is converted when the parameter checks fail.

   Local: -80(%ebp) = width / 4 (loop count per line) */
ENTRY(vcvt_420i_420p)
	enter	$80, $0			# StackSpace + 4 bytes extra, no nested frames
	push	%ebx
	push	%esi
	push	%edi

	call	test_param_13
	jc	9f

	/* Okay, this is fairly easy... we first grab the Y values (4 bytes
	   at a time), then rewind and do the U values, and repeat for V.
	   This leaves us with a nice planar format */

	mov	Width, %eax
	shr	%eax
	shr	%eax			# width / 4
	mov	%eax, -80(%ebp)		# Store

	/* Y */
	mov	Height, %edx		# line counter
0:	mov	-80(%ebp), %ecx
	push	%edi
1:	lodsl				# get 4 bytes...
	stosl				# ...push 4 bytes
	add	$2, %esi		# Skip U or V
	loop	1b
	pop	%edi
	add	Plus, %edi
	dec	%edx
	jnz	0b

	shrl	$1, Plus		# divide increment by 2

	/* U */
	mov	Src4, %esi		# rewind source pointer
	mov	DstU, %edi
	add	$4, %esi		# set to U
	mov	Height, %edx
	shr	%edx			# height / 2
	mov	Width, %ebx
	shl	%ebx
	add	Width, %ebx
	shr	%ebx			# Width * 1.5 (line offset)
2:	mov	-80(%ebp), %ecx		# width / 4
	push	%edi
3:	lodsw				# 2 bytes at a time
	stosw
	add	$4, %esi		# skip Y
	loop	3b
	add	%ebx, %esi		# Skip line (U is on even lines)
	pop	%edi
	add	Plus, %edi
	dec	%edx
	jnz	2b

	/* V */
	mov	Src4, %esi		# rewind, set to V in first odd line
	add	$4, %esi
	add	%ebx, %esi		# register re-use; no compiler can beat that :)
	mov	DstV, %edi		# V ptr
	mov	Height, %edx
	shr	%edx			# height / 2
4:	mov	-80(%ebp), %ecx		# Get width/4
	push	%edi
5:	lodsw
	stosw
	add	$4, %esi		# Skip Y
	loop	5b
	add	%ebx, %esi		# Skip line (V is on odd lines)
	pop	%edi
	add	Plus, %edi
	dec	%edx
	jnz	4b

	/* That's it! */

9:	pop	%edi
	pop	%esi
	pop	%ebx
	leave
	ret


/* Go from 4:2:0 interlaced to 'normal' YUYV.

   Stack arguments: width, height, plus, src, dst.
   The destination holds 2 bytes per pixel, so its line stride is 2 * Plus
   bytes (the Plus argument slot is doubled in place), in line with the
   other converters which all scale Plus by their bytes per pixel.
   NOTE(review): this assumes Plus is given in pixels, as the other
   converters in this file treat it - confirm against the callers.
   Nothing is converted when the parameter checks fail.

   Locals: -80(%ebp) = width / 4 (loop count per line)
           -84(%ebp) = width * 1.5 (source line offset in bytes) */
ENTRY(vcvt_420i_yuyv)
	enter	$84, $0			# StackSpace + 8 bytes extra, no nested frames
	push	%ebx
	push	%esi
	push	%edi

	call	test_param_2
	jc	9f

	mov	Width, %ecx		# width / 4 = no. loops per line
	shr	%ecx
	shr	%ecx
	mov	%ecx, -80(%ebp)

	mov	Width, %ebx		# width * 1.5 = line offset
	shl	%ebx
	add	Width, %ebx
	shr	%ebx
	mov	%ebx, -84(%ebp)

	/* Okay, this requires a bit of byte shuffling... we go from
	    YYYY UU
	    YYYY VV
	   to
	    YUYV YUYV
	    YUYV YUYV
	   which indeed takes up more space */

	shll	$1, Plus		# Plus * 2 (2 bytes per pixel)

0:	mov	-80(%ebp), %ecx
	push	%edi

1:	lodsl				# 4 Y in eax
	testl	$1, Height		# even or odd line?
	jnz	2f

	/* Even line count left: U follows the Y bytes, V is one line down */
	mov	-84(%ebp), %ebx
	mov	(%ebx, %esi), %dx	# 16 bits V
	shl	$16, %edx		# store in high word
	mov	(%esi), %dx		# 16 bits U
	add	$2, %esi
	jmp	3f

2:	/* Odd line count left: V follows the Y bytes, U is one line up */
	mov	-84(%ebp), %ebx
	neg	%ebx			# negative offset
	mov	(%esi), %dx		# 16 bits V
	shl	$16, %edx		# store in high word
	mov	(%ebx, %esi), %dx	# 16 bits U
	add	$2, %esi

3:	/* eax = Y3Y2Y1Y0, edx = V1V0U1U0, ebx is free */
	push	%eax

	movzbl	%al, %ebx		# ______y0
	and	$0xFF00, %eax		# ____y1__
	shl	$8, %eax		# __y1____
	or	%ebx, %eax		# __y1__y0
	mov	%edx, %ebx		# v1v0u1u0
	shl	$8, %ebx		# v0u1u0__
	and	$0xff00ff00, %ebx	# v0__u0__
	or	%ebx, %eax		# v0y1u0y0
	stosl

	pop	%eax			# y3y2y1y0
	/* Second half */
	shr	$8, %eax		# __y3y2y1
	shr	$8, %ax			# __y3__y2
	and	$0xff00ff00, %edx	# v1__u1__
	or	%edx, %eax		# v1y3u1y2
	stosl

	loop	1b

	pop	%edi
	add	Plus, %edi
	decl	Height			# height--
	jnz	0b
	/* Done */

9:	pop	%edi
	pop	%esi
	pop	%ebx
	leave
	ret