shreyas_potnis
10th August 2003 15:09 UTC
APE Structure Change
These God damned cyber cafes...something went wrong and everything I typed was blown off.
Anyways, here goes:
Changing the entire structure of APEs to make them faster. The render function in AVS is passed the following values:
w, h, visdata, *framebuffer.
w = width, h = height, visdata (an array) = the spectrum and all those values,
*framebuffer = pointer to the first pixel of the AVS screen.
We need to modify the w*h pixels starting at the address framebuffer points to. The colour of each pixel is stored as an integer in the format 00 RR GG BB. So right now we have to split the colour apart, modify it, and then remake it again. Instead, what I propose is: instead of passing one pointer to the packed colour, use 3 separate integers for the colour of a pixel and pass 3 different pointers, *r, *g, *b.
I know this would mean three times the memory, but splitting each and every pixel and then putting it back together is avoided.
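(Just to spell out the overhead I mean, here is roughly what every per-pixel effect has to do today with the packed format; the halve-each-channel tweak is just a stand-in for whatever the effect really does:)
// What a per-pixel effect has to do right now with the packed
// 0x00RRGGBB format. "Halve each channel" is just an example tweak.
static void example_effect(int *framebuffer, int w, int h)
{
    for (int i = 0; i < w * h; i++)
    {
        int pixel = framebuffer[i];
        int r = (pixel >> 16) & 0xFF;   // split the colour...
        int g = (pixel >> 8)  & 0xFF;
        int b =  pixel        & 0xFF;
        r /= 2; g /= 2; b /= 2;         // ...modify it...
        framebuffer[i] = (r << 16) | (g << 8) | b;   // ...and remake it.
    }
}
With three separate channel pointers, all that shifting and masking would go away.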
e.g. (channel shift):
.... Render (..,..,..,int *r, int *g, int *b)
{
int temp, wh;
for(int i=0; i<h; i++)
for(int j=0; j<w; j++)
{
wh = i*w + j;       // index of the pixel at (j, i)
temp = *(r+wh);     // swap the red and green channels
*(r+wh) = *(g+wh);
*(g+wh) = temp;
}
return 0;
}
Simple, isn't it?
But actually getting this change made is probably next to impossible, I think.
UnConeD
10th August 2003 15:42 UTC
Hah. You seriously need to learn how to do assembly/MMX coding. Here's the source code for the essential part of channel shift.
It processes 4 pixels per loop iteration (so it cuts the number of jumps by a factor of 4), but that's the only special thing about it.
c = w*h;
switch (config.mode) {
default:
case IDC_RGB: // unchanged, nothing to do
return 0;
case IDC_RBG: // output channel order R,B,G (swap G and B)
__asm {
mov ebx, framebuffer;
mov ecx, c;
lp1:
sub ecx, 4;
mov eax, dword ptr [ebx+ecx*4];
xchg ah, al;
mov [ebx+ecx*4], eax;
mov eax, dword ptr [ebx+ecx*4+4];
xchg ah, al;
mov [ebx+ecx*4+4], eax;
mov eax, dword ptr [ebx+ecx*4+8];
xchg ah, al;
mov [ebx+ecx*4+8], eax;
mov eax, dword ptr [ebx+ecx*4+12];
xchg ah, al;
mov [ebx+ecx*4+12], eax;
test ecx, ecx;
jnz lp1;
}
break;
case IDC_BRG: // output channel order B,R,G
__asm {
mov ebx, framebuffer;
mov ecx, c;
lp2:
sub ecx, 4;
mov eax, dword ptr [ebx+ecx*4];
mov dl, al;
shr eax, 8;
bswap eax;
mov ah, dl;
bswap eax;
mov [ebx+ecx*4], eax;
mov eax, dword ptr [ebx+ecx*4+4];
mov dl, al;
shr eax, 8;
bswap eax;
mov ah, dl;
bswap eax;
mov [ebx+ecx*4+4], eax;
mov eax, dword ptr [ebx+ecx*4+8];
mov dl, al;
shr eax, 8;
bswap eax;
mov ah, dl;
bswap eax;
mov [ebx+ecx*4+8], eax;
mov eax, dword ptr [ebx+ecx*4+12];
mov dl, al;
shr eax, 8;
bswap eax;
mov ah, dl;
bswap eax;
mov [ebx+ecx*4+12], eax;
test ecx, ecx;
jnz lp2;
}
break;
case IDC_BGR: // output channel order B,G,R (swap R and B)
__asm {
mov ebx, framebuffer;
mov ecx, c;
lp3:
sub ecx, 4;
mov eax, dword ptr [ebx+ecx*4];
bswap eax;
shr eax, 8;
mov [ebx+ecx*4], eax;
mov eax, dword ptr [ebx+ecx*4+4];
bswap eax;
shr eax, 8;
mov [ebx+ecx*4+4], eax;
mov eax, dword ptr [ebx+ecx*4+8];
bswap eax;
shr eax, 8;
mov [ebx+ecx*4+8], eax;
mov eax, dword ptr [ebx+ecx*4+12];
bswap eax;
shr eax, 8;
mov [ebx+ecx*4+12], eax;
test ecx, ecx;
jnz lp3;
}
break;
case IDC_GBR: // output channel order G,B,R
__asm {
mov ebx, framebuffer;
mov ecx, c;
lp4:
sub ecx, 4;
mov eax, dword ptr [ebx+ecx*4];
mov edx, eax;
bswap edx;
shl eax, 8;
mov al, dh;
mov [ebx+ecx*4], eax;
mov eax, dword ptr [ebx+ecx*4+4];
mov edx, eax;
bswap edx;
shl eax, 8;
mov al, dh;
mov [ebx+ecx*4+4], eax;
mov eax, dword ptr [ebx+ecx*4+8];
mov edx, eax;
bswap edx;
shl eax, 8;
mov al, dh;
mov [ebx+ecx*4+8], eax;
mov eax, dword ptr [ebx+ecx*4+12];
mov edx, eax;
bswap edx;
shl eax, 8;
mov al, dh;
mov [ebx+ecx*4+12], eax;
test ecx, ecx;
jnz lp4;
}
break;
case IDC_GRB: // output channel order G,R,B (swap R and G)
__asm {
mov ebx, framebuffer;
mov ecx, c;
lp5:
sub ecx, 4;
mov eax, dword ptr [ebx+ecx*4];
shl eax, 8;
bswap eax;
xchg ah, al;
bswap eax;
shr eax, 8;
mov [ebx+ecx*4], eax;
mov eax, dword ptr [ebx+ecx*4+4];
shl eax, 8;
bswap eax;
xchg ah, al;
bswap eax;
shr eax, 8;
mov [ebx+ecx*4+4], eax;
mov eax, dword ptr [ebx+ecx*4+8];
shl eax, 8;
bswap eax;
xchg ah, al;
bswap eax;
shr eax, 8;
mov [ebx+ecx*4+8], eax;
mov eax, dword ptr [ebx+ecx*4+12];
shl eax, 8;
bswap eax;
xchg ah, al;
bswap eax;
shr eax, 8;
mov [ebx+ecx*4+12], eax;
test ecx, ecx;
jnz lp5;
}
break;
}
return 0;
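(If you don't read asm: the IDC_BGR case above boils down to roughly this in plain C. This is just a rough equivalent for illustration, not the actual plugin source:)
// Rough C equivalent of the IDC_BGR case: swap the R and B bytes of
// every packed 0x00RRGGBB pixel (a full RGB -> BGR reversal).
static void channel_shift_bgr_c(int *framebuffer, int w, int h)
{
    for (int i = 0; i < w * h; i++)
    {
        unsigned int px = (unsigned int) framebuffer[i];
        unsigned int r = (px >> 16) & 0xFF;
        unsigned int b = px & 0xFF;
        framebuffer[i] = (int) ((px & 0x0000FF00) | (b << 16) | r);
    }
}
The asm does the same thing with one bswap and one shift per pixel instead of all the masking.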
But what you're saying IS already possible. Just do this:
unsigned char *r, *g, *b;
r = g = b = (unsigned char *) framebuffer;
r += 2; ++g; // on a little-endian x86 the bytes of 0x00RRGGBB sit in memory as BB GG RR 00
And then skip 4 bytes ahead every time using
r+=4;g+=4;b+=4.
But this is wildly inefficient because the x86 has a 32-bit wide memory bus and works much faster when processing entire, dword aligned, 32-bit blocks.
The current memory structure IS the most efficient. You're just not using it correctly.
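(Spelled out as a complete loop, that byte-pointer view looks something like this, again assuming the 0x00RRGGBB layout on a little-endian x86; the red/green swap is just an example operation:)
// Byte-pointer view of the packed framebuffer: r, g and b walk through
// the same memory, 4 bytes per pixel.
static void per_channel_view(int *framebuffer, int w, int h)
{
    unsigned char *b = (unsigned char *) framebuffer;  // byte 0 = blue
    unsigned char *g = b + 1;                          // byte 1 = green
    unsigned char *r = b + 2;                          // byte 2 = red
    for (int i = 0; i < w * h; i++)
    {
        unsigned char t = *r; *r = *g; *g = t;         // e.g. swap red and green
        r += 4; g += 4; b += 4;                        // next pixel
    }
}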
The same goes for most other C/C++ tricks. You might think "a^=b;b^=a;a^=b;" is a smart way of exchanging vars, but in fact it's wildly inefficient unless you have a compiler that specifically recognizes that code.
[Ishan]
12th August 2003 16:02 UTC
argh.. tech stuff :confused:
i'm just happy to use APEs :p not create them :p
yes, i'm a very cheap person ;)
jheriko
17th August 2003 07:12 UTC
MMX is great for working with colours. You can perform quite a few equivalents of the x86 instructions on all four colour components at once; SSE and SSE2 fill the gaps in MMX, but they aren't as widespread.
I'd recommend looking it up... it actually makes a lot of code easier. Like an RGB<->BGR swap, commonly needed when reading bitmap files, for instance... not that I'd ever have thought of doing 4 pixels at a time for speed.
/me notes UCDs idea down on list of things to steal
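For example, an R<->B swap with the MMX intrinsics from <mmintrin.h> could look roughly like this, two packed 0x00RRGGBB pixels per register (just a sketch, and it assumes the pixel count is even):
#include <mmintrin.h> // MMX intrinsics

// Sketch: swap R and B in packed 0x00RRGGBB pixels, two pixels per
// 64-bit MMX register. Assumes npixels is a multiple of 2.
static void swap_rb_mmx(int *framebuffer, int npixels)
{
    __m64 rmask = _mm_set1_pi32(0x00FF0000);
    __m64 gmask = _mm_set1_pi32(0x0000FF00);
    __m64 bmask = _mm_set1_pi32(0x000000FF);
    __m64 *p = (__m64 *) framebuffer;
    for (int i = 0; i < npixels / 2; i++)
    {
        __m64 px = p[i];
        __m64 r = _mm_srli_pi32(_mm_and_si64(px, rmask), 16); // red down to the blue slot
        __m64 b = _mm_slli_pi32(_mm_and_si64(px, bmask), 16); // blue up to the red slot
        __m64 g = _mm_and_si64(px, gmask);                    // green stays put
        p[i] = _mm_or_si64(_mm_or_si64(r, g), b);
    }
    _mm_empty(); // MMX shares registers with the FPU, so clear the state afterwards
}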
How could a compiler not directly compile a bitwise XOR statement? What else would it need to do with it? Surely when a piece of C code with a^=b;b^=a;a^=b; is compiled, it just gets replaced in the assembly with xor a, b / xor b, a / xor a, b (insert registers and movs as required). P9's always telling me how immensely efficient C compilers actually are, and how you'd think the output would be crap but it actually optimises code really well and junk.
UnConeD
18th August 2003 23:12 UTC
In my experience at least the xor-switch thingy sucks.
This is because the vars are not stored in registers, and the code becomes:
load a -> 1;
load b -> 2;
xor 1,2 -> 1;
store a <- 1;
load b -> 1;
load a -> 2;
xor 1,2 -> 1;
store b <- 1;
load a -> 1;
load b -> 2;
xor 1,2 -> 1;
store a <- 1;
With 1,2 being two CPU registers. This means 6 loads and 3 stores.
Something like that.
The better statement would simply be:
register 'type' temp;
temp = a; a = b; b = temp;
Which a proper compiler should indeed optimize into the following snippet (which I use through inline asm):
__asm {
mov eax, var1;
mov ecx, var2;
mov var1, ecx;
mov var2, eax;
}
2 loads and 2 stores...
It could be that caching simply removes the inefficiency; I really don't know much about modern CPU performance these days.
I've only toyed around with MSVC++ though, I've heard Intel's C++ compiler is much better at optimizing.
The thing is, C/C++ allows you to express certain stuff in really compact code, but the translation to machine code can turn out to be much more complicated on an x86.
I don't know if the 4-pixels-at-a-time unrolling actually has any effect on a modern CPU, but a jump used to kill the instruction cache last time I checked. I should really benchmark this stuff ;)... In any case, it probably can't hurt, and because AVS framebuffers are always a multiple of 4 pixels, you don't need extra checking for it to work.
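(In plain C the same unroll-by-4 trick would look roughly like this, leaning on the fact that w*h is always a multiple of 4; the invert is just a placeholder operation:)
// Sketch: unroll a per-pixel loop by 4. No remainder handling is
// needed because the AVS framebuffer is always a multiple of 4 pixels.
static void invert_unrolled(int *framebuffer, int w, int h)
{
    int n = w * h;
    for (int i = 0; i < n; i += 4)
    {
        framebuffer[i]     ^= 0x00FFFFFF; // invert (placeholder per-pixel op)
        framebuffer[i + 1] ^= 0x00FFFFFF;
        framebuffer[i + 2] ^= 0x00FFFFFF;
        framebuffer[i + 3] ^= 0x00FFFFFF;
    }
}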