/* Chunky Routines v1.3 */

#include <stdlib.h>
#include <stdio.h>
#include <exec/types.h>

/*
 * Callback used by processReadChunkyPixels: invoked once per pixel with the
 * caller-supplied data pointer, the pixel index and the pixel value.
 */
typedef void (*readPixel)(APTR data, UBYTE index, UBYTE pixel);
/*
 * Callback used by processWriteChunkyPixels: invoked once per pixel with the
 * caller-supplied data pointer and the pixel index; returns the pixel value.
 */
typedef UBYTE (*writePixel)(APTR data, UBYTE index);

/*
 * Read pixel line (4 bitplanes).
 *
 * Each pass of the outer loop fetches one longword from each of the four
 * planes and stores four longwords to dest. In every output longword each
 * nibble is built from one bit per plane: plane3 supplies bit 3 of the
 * nibble, plane2 bit 2, plane1 bit 1 and plane0 bit 0. Output longword n
 * (0-3) of a pass uses the plane bits found n positions below the top bit
 * of each nibble.
 *
 * The loop runs while dest < end and always stores four longwords per pass,
 * so the dest..end range should be a multiple of four longwords.
 */
void readPixelLine4(ULONG *plane0, ULONG *plane1, ULONG *plane2, ULONG *plane3, ULONG *dest, ULONG *end)
{
    const ULONG topBits = 0x88888888; /* top bit of every nibble */
    ULONG p0, p1, p2, p3;
    WORD remaining;

    while (dest < end)
    {
        p0 = *plane0++;
        p1 = *plane1++;
        p2 = *plane2++;
        p3 = *plane3++;

        for (remaining = 4; remaining > 0; remaining--)
        {
            /* Gather the current top bit of each nibble from all planes */
            *dest = (p3 & topBits)
                  | ((p2 & topBits) >> 1)
                  | ((p1 & topBits) >> 2)
                  | ((p0 & topBits) >> 3);
            dest++;

            /* Bring the next bit of every nibble up to the top position */
            p0 <<= 1;
            p1 <<= 1;
            p2 <<= 1;
            p3 <<= 1;
        }
    }
}

/*
 * Write pixel line (4 bitplanes).
 *
 * Each pass of the outer loop consumes four longwords from data and stores
 * one longword to each plane, in the order plane3, plane2, plane1, plane0.
 * plane3 receives bit 3 of every nibble, plane2 bit 2, plane1 bit 1 and
 * plane0 bit 0. Within a plane longword, the bit taken from data longword n
 * (0-3) of the pass lands n positions below the top bit of each nibble.
 *
 * The loop runs while data < end and always consumes four longwords per
 * pass, so the data..end range should be a multiple of four longwords.
 */
void writePixelLine4(ULONG *data, ULONG *end, ULONG *plane0, ULONG *plane1, ULONG *plane2, ULONG *plane3)
{
    const ULONG topBits = 0x88888888; /* top bit of every nibble */
    ULONG *out[4];
    ULONG c0, c1, c2, c3;
    WORD level;

    /* Highest plane first: it takes the top bit of each nibble */
    out[0] = plane3;
    out[1] = plane2;
    out[2] = plane1;
    out[3] = plane0;

    while (data < end)
    {
        c0 = *data++; /* Pixels 0, 4, 8, 12, 16, 20, 24, 28 */
        c1 = *data++;
        c2 = *data++;
        c3 = *data++;

        for (level = 0; level < 4; level++)
        {
            *out[level] = (c0 & topBits)
                        | ((c1 & topBits) >> 1)
                        | ((c2 & topBits) >> 2)
                        | ((c3 & topBits) >> 3);
            out[level]++;

            /* Expose the next lower nibble bit for the next plane */
            c0 <<= 1;
            c1 <<= 1;
            c2 <<= 1;
            c3 <<= 1;
        }
    }
}

/*
 * 8-bit linear pixel processing (read direction).
 *
 * Consumes four longwords from low and four from high and calls process()
 * once for each of the 32 pixel indices 0-31. For longword i (0-3), nibble k
 * counted from the most significant end belongs to pixel 4*k + i; the nibble
 * from low forms bits 0-3 of the pixel and the nibble from high forms
 * bits 4-7. This is the same packing that processWriteChunkyPixels stores.
 *
 * low     - four longwords holding the low nibbles of the pixels
 * high    - four longwords holding the high nibbles of the pixels
 * process - callback receiving (data, index, pixel) for every pixel
 * data    - opaque pointer handed through to process unchanged
 *
 * For each i the callback order is index 24+i, 28+i, 16+i, 20+i, 8+i, 12+i,
 * 0+i, 4+i.
 */
void processReadChunkyPixels(ULONG *low, ULONG *high, readPixel process, APTR data)
{
    const ULONG mask = 0x0f0f0f0f; /* low nibble of every byte */
    const ULONG comp = 0xf0f0f0f0; /* high nibble of every byte */
    ULONG lo, hi;
    ULONG work3, work4;
    WORD i, base;

    for (i = 0; i < 4; i++)
    {
        lo = *low++;
        hi = *high++;

        /*
         * Merge into byte pixels. Each step is a separate statement so that
         * no object is modified more than once between sequence points.
         */
        work3 = (hi & comp) | ((lo & comp) >> 4); /* 0, 8, 16, 24 (+i) */
        work4 = ((hi & mask) << 4) | (lo & mask); /* 4, 12, 20, 28 (+i) */

        /* Hand out the bytes starting with the least significant one */
        for (base = 24; base >= 0; base -= 8)
        {
            process(data, (UBYTE)(base + i), (UBYTE)work3);
            process(data, (UBYTE)(base + 4 + i), (UBYTE)work4);
            work3 >>= 8;
            work4 >>= 8;
        }
    }
}

/*
 * 8-bit linear pixel processing (write direction).
 *
 * Calls process() once for each of the 32 pixel indices 0-31 and stores the
 * returned 8-bit pixels as four longwords to low and four to high. For
 * longword i (0-3), nibble k counted from the most significant end holds
 * pixel 4*k + i: bits 0-3 of the pixel go to low, bits 4-7 go to high.
 *
 * low     - receives four longwords with the low nibbles of the pixels
 * high    - receives four longwords with the high nibbles of the pixels
 * process - callback returning the pixel for (data, index)
 * data    - opaque pointer handed through to process unchanged
 *
 * For each i the callback is asked for indices 0+i, 8+i, 16+i, 24+i and then
 * 4+i, 12+i, 20+i, 28+i, in that order.
 */
void processWriteChunkyPixels(ULONG *low, ULONG *high, writePixel process, APTR data)
{
    const ULONG mask = 0x0f0f0f0f; /* low nibble of every byte */
    const ULONG comp = 0xf0f0f0f0; /* high nibble of every byte */
    ULONG work1, work2;
    WORD i, base;

    for (i = 0; i < 4; i++)
    {
        /*
         * Pack four pixels per longword, first pixel in the top byte. The
         * callback result is widened to ULONG before shifting so the shift
         * is never performed on a signed int.
         */
        work1 = 0;
        for (base = 0; base < 32; base += 8)
        {
            work1 = (work1 << 8) | (ULONG)process(data, (UBYTE)(base + i));
        }

        work2 = 0;
        for (base = 4; base < 32; base += 8)
        {
            work2 = (work2 << 8) | (ULONG)process(data, (UBYTE)(base + i));
        }

        /* Split back to 4-bit pixels */
        *low++ = ((work1 & mask) << 4) | (work2 & mask);
        *high++ = (work1 & comp) | ((work2 & comp) >> 4);
    }
}

void testReadPixel(UBYTE *buffer, UBYTE index, UBYTE pixel)
{
    buffer[index] = pixel;
}

UBYTE testWritePixel(UBYTE *buffer, UBYTE index)
{
    return(buffer[index]);
}

/*
 * Small driver: converts eight plane longwords to low/high nibble longwords,
 * expands them to 32 byte pixels in buffer, then runs the same steps in the
 * opposite direction back into the plane array.
 */
int main(void)
{
    ULONG low[4], high[4]; /* Chunky-pixel low and high bits */
    /* Fixed input pattern so the routines never read indeterminate data */
    ULONG plane[8] = {
        0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210,
        0x0f0f0f0f, 0xf0f0f0f0, 0x33333333, 0xcccccccc
    };
    UBYTE buffer[32];

    printf("Read pixel line\n");
    readPixelLine4(plane, plane + 1, plane + 2, plane + 3, low, low + 4);
    readPixelLine4(plane + 4, plane + 5, plane + 6, plane + 7, high, high + 4);

    processReadChunkyPixels(low, high, testReadPixel, buffer);

    printf("Write pixel line\n");
    processWriteChunkyPixels(low, high, testWritePixel, buffer);

    writePixelLine4(low, low + 4, plane, plane + 1, plane + 2, plane + 3);
    writePixelLine4(high, high + 4, plane + 4, plane + 5, plane + 6, plane + 7);

    return(0);
}