/*
 * This file is part of the coreboot project.
 *
 * Copyright (C) 2001 Michael Schroeder
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

/*
 * a tiny jpeg decoder.
 *
 * written in August 2001 by Michael Schroeder <mls@suse.de>
 *
 */

#define __LITTLE_ENDIAN
#include <string.h>
#include "jpeg.h"
/* Number of fractional bits used by the fixed-point arithmetic below. */
#define ISHIFT 11

/* Convert a constant to fixed point (scaled by 1 << ISHIFT, .5 added before truncation). */
#define IFIX(a) ((int)((a) * (1 << ISHIFT) + .5))
/* Multiply two values and shift the product right by ISHIFT. */
#define IMULT(a, b) (((a) * (b)) >> ISHIFT)
/* Shift a fixed-point value right by ISHIFT, dropping the fractional bits. */
#define ITOINT(a) ((a) >> ISHIFT)

/* Prototype wrapper: expands to its argument unless __P is already defined. */
#ifndef __P
# define __P(x) x
#endif

/*
 * special markers
 *
 * M_BADHUFF is stored in in->marker by dec_rec2() when no huffman code
 * matches. M_EOF is the marker byte on which fillbits() calls in->func,
 * if one is set.
 */
#define M_BADHUFF	-1
#define M_EOF		0x80

/* State of the bit reader used while decoding the entropy-coded data. */
struct in {
	/* next input byte to be read by fillbits() */
	unsigned char *p;
	/* bit accumulator */
	unsigned int bits;
	/* number of valid bits in 'bits' */
	int left;
	/* marker byte met in the stream, 0 if none is pending */
	int marker;

	/* called by fillbits() on M_EOF; a 0 return resumes reading */
	int (*func) __P((void *));
	/* argument handed to func */
	void *data;
};

/*********************************/
/* Forward declarations; only struct dec_hufftbl is defined further down. */
struct dec_hufftbl;
struct enc_hufftbl;

/* Pointer to a huffman table; the decoder in this file uses the dhuff member. */
union hufftblp {
	struct dec_hufftbl *dhuff;
	struct enc_hufftbl *ehuff;
};

/* Per-component decoding state for the scan being decoded. */
struct scan {
	int dc;			/* old dc value */

	/* DC and AC huffman tables selected for this component */
	union hufftblp hudc;
	union hufftblp huac;
	int next;		/* when to switch to next scan */

	int cid;		/* component id */
	int hv;			/* horiz/vert, copied from comp */
	int tq;			/* quant tbl, copied from comp */
};

/*********************************/

/* Number of stream bits used to index the llvals fast lookup table. */
#define DECBITS 10		/* seems to be the optimum */

/* Decoding tables for one huffman table, filled in by dec_makehuff(). */
struct dec_hufftbl {
	/* per code length: one past the last code assigned; [16] is a terminator */
	int maxcode[17];
	/* per code length: index into vals of the first symbol of that length */
	int valptr[16];
	/* symbols in the order they were read from the table definition */
	unsigned char vals[256];
	/* fast lookup indexed by the next DECBITS bits (layout: see dec_makehuff) */
	unsigned int llvals[1 << DECBITS];
};

/* Huffman decoder entry points, defined further down in this file. */
static void decode_mcus __P((struct in *, int *, int, struct scan *, int *));
static int dec_readmarker __P((struct in *));
static void dec_makehuff __P((struct dec_hufftbl *, int *, unsigned char *));

static void setinput __P((struct in *, unsigned char *));
/*********************************/

/* PREC is the integer type used for the fixed-point idct and quant tables. */
#undef PREC
#define PREC int

static void idctqtab __P((unsigned char *, PREC *));
static void idct __P((int *, int *, PREC *, PREC, int));
static void scaleidctqtab __P((PREC *, PREC));

/*********************************/

static void initcol __P((PREC[][64]));

/* Colour converters for 24, 16 and 32 bits per pixel output respectively. */
static void col221111 __P((int *, unsigned char *, int));
static void col221111_16 __P((int *, unsigned char *, int));
static void col221111_32 __P((int *, unsigned char *, int));

/*********************************/

/* Marker codes: the byte that follows a 0xff byte in the stream. */
#define M_SOI	0xd8
#define M_APP0	0xe0
#define M_DQT	0xdb
#define M_SOF0	0xc0
#define M_DHT   0xc4
#define M_DRI	0xdd
#define M_SOS	0xda
#define M_RST0	0xd0
#define M_EOI	0xd9
#define M_COM	0xfe

/* Read cursor used while parsing the header segments. */
static unsigned char *datap;

/* Return the byte under the read cursor and move the cursor past it. */
static int getbyte(void)
{
	unsigned char value = *datap;

	datap++;
	return value;
}

/* Read two bytes at the read cursor as a 16-bit value, high byte first. */
static int getword(void)
{
	int word;

	word = *datap++ << 8;
	word |= *datap++;
	return word;
}

/* One component entry as read from the frame header by jpeg_decode(). */
struct comp {
	int cid;	/* component id */
	int hv;		/* sampling factors: horizontal in the high nibble, vertical in the low */
	int tq;		/* quantization table selector (0..3) */
};

/* Maximum number of components accepted in a frame header. */
#define MAXCOMP 4
/* Decoder-wide parameters gathered while parsing the headers. */
struct jpginfo {
	int nc;			/* number of components */
	int ns;			/* number of scans */
	int dri;		/* restart interval */
	int nm;			/* mcus til next marker */
	int rm;			/* next restart marker */
};

/*
 * File-scope decoder state. Everything below is shared by all calls, so
 * only one image can be decoded at a time.
 */
static struct jpginfo info;
static struct comp comps[MAXCOMP];

static struct scan dscans[MAXCOMP];

/* Raw quantization tables as read from DQT segments, indexed by table id. */
static unsigned char quant[4][64];

/* Huffman tables: entries 0-1 are the DC tables, entries 2-3 the AC tables. */
static struct dec_hufftbl dhuff[4];

#define dec_huffdc (dhuff + 0)
#define dec_huffac (dhuff + 2)

/* Bit reader state for the entropy-coded data. */
static struct in glob_in;

/*
 * Parse marker segments starting at datap until the marker 'till' is met.
 *
 * DQT segments fill quant[], DHT segments build dhuff[] via dec_makehuff(),
 * DRI sets info.dri, and every other segment is skipped using its length
 * field. On return datap points just behind the 'till' marker.
 *
 * Returns 0 when 'till' was found (or when marker 0xc2 was met, see below)
 * and -1 when the data is malformed.
 */
static int readtables(int till)
{
	int m, l, i, j, lq, pq, tq;
	int tc, th, tt;

	for (;;) {
		/* every segment must start with 0xff followed by the marker */
		if (getbyte() != 0xff)
			return -1;
		m = getbyte();
		if (m == till)
			break;

		switch (m) {
		case 0xc2:
			/*
			 * NOTE(review): 0xc2 is the progressive frame marker
			 * in the JPEG standard; returning 0 here leaves the
			 * rejection to the caller's later checks - confirm.
			 */
			return 0;

		case M_DQT:
			lq = getword();
			while (lq > 2) {
				pq = getbyte();
				tq = pq & 15;
				if (tq > 3)
					return -1;
				/* only tables with one byte per entry are read */
				pq >>= 4;
				if (pq != 0)
					return -1;
				for (i = 0; i < 64; i++)
					quant[tq][i] = getbyte();
				lq -= 64 + 1;
			}
			break;

		case M_DHT:
			l = getword();
			while (l > 2) {
				int hufflen[16], k;
				unsigned char huffvals[256];

				tc = getbyte();
				th = tc & 15;
				tc >>= 4;
				tt = tc * 2 + th;
				if (tc > 1 || th > 1)
					return -1;
				k = 0;
				for (i = 0; i < 16; i++) {
					hufflen[i] = getbyte();
					k += hufflen[i];
				}
				/*
				 * The counts come from the file; refuse tables
				 * with more symbols than huffvals can hold
				 * instead of writing past the buffer.
				 */
				if (k > (int)sizeof(huffvals))
					return -1;
				l -= 1 + 16;
				k = 0;
				for (i = 0; i < 16; i++) {
					for (j = 0; j < hufflen[i]; j++)
						huffvals[k++] = getbyte();
					l -= hufflen[i];
				}
				dec_makehuff(dhuff + tt, hufflen,
					     huffvals);
			}
			break;

		case M_DRI:
			/* segment length is read and ignored */
			l = getword();
			info.dri = getword();
			break;

		default:
			/* unknown segment: skip its payload */
			l = getword();
			while (l-- > 2)
				getbyte();
			break;
		}
	}
	return 0;
}

/*
 * Prepare the restart bookkeeping and clear the DC predictor of every
 * scan component before decoding starts.
 */
static void dec_initscans(void)
{
	int idx = 0;

	info.rm = M_RST0;
	info.nm = info.dri + 1;
	while (idx < info.ns) {
		dscans[idx].dc = 0;
		idx++;
	}
}

/*
 * Consume the restart marker that is due now.
 *
 * Returns -1 when the marker read from the stream is not the expected
 * one. Otherwise the mcu countdown is reloaded, the expected marker is
 * advanced (bit 3 is masked off after the increment), all DC predictors
 * are cleared and 0 is returned.
 */
static int dec_checkmarker(void)
{
	int idx;
	int found = dec_readmarker(&glob_in);

	if (found != info.rm)
		return -1;

	info.nm = info.dri;
	info.rm = (info.rm + 1) & ~0x08;
	idx = 0;
	while (idx < info.ns) {
		dscans[idx].dc = 0;
		idx++;
	}
	return 0;
}

/*
 * Store the image dimensions found in the frame header of 'buf' into
 * *width and *height. The first two bytes are skipped unchecked and the
 * result of readtables() is ignored, so the values are only meaningful
 * for a well-formed file.
 */
void jpeg_fetch_size(unsigned char *buf, int *width, int *height)
{
	int rows, cols;

	datap = buf;
	(void)getbyte();
	(void)getbyte();
	(void)readtables(M_SOF0);
	(void)getword();	/* segment length */
	(void)getbyte();	/* sample precision */
	rows = getword();
	cols = getword();
	*height = rows;
	*width = cols;
}

/*
 * Compare the dimensions in the frame header of 'buf' with 'width' and
 * 'height'. Returns 1 when both match and 0 otherwise. The width is only
 * read when the height already matched.
 */
int jpeg_check_size(unsigned char *buf, int width, int height)
{
	int frame_height, frame_width;

	datap = buf;
	(void)getbyte();
	(void)getbyte();
	(void)readtables(M_SOF0);
	(void)getword();	/* segment length */
	(void)getbyte();	/* sample precision */

	frame_height = getword();
	if (frame_height != height)
		return 0;

	frame_width = getword();
	if (frame_width != width)
		return 0;

	return 1;
}

/*
 * Decode the JPEG image in 'buf' into the pixel buffer 'pic'.
 *
 * width and height must be multiples of 16 and equal to the image
 * dimensions rounded up to a multiple of 16. depth selects the output
 * format: 16, 24 or 32 bits per pixel. decdata supplies the work buffers
 * (dquant, dcts, out) used during decoding.
 *
 * Only three-component images with sampling 0x22/0x11/0x11 and component
 * ids 1, 2, 3 are accepted.
 *
 * Returns 0 on success, -1 when a pointer argument is NULL, and one of
 * the ERR_* codes otherwise.
 */
int jpeg_decode(unsigned char *buf, unsigned char *pic,
		int width, int height, int depth, struct jpeg_decdata *decdata)
{
	int i, j, m, tac, tdc;
	int mcusx, mcusy, mx, my;
	int max[6];

	if (!decdata || !buf || !pic)
		return -1;
	datap = buf;
	/* the file must start with 0xff M_SOI */
	if (getbyte() != 0xff)
		return ERR_NO_SOI;
	if (getbyte() != M_SOI)
		return ERR_NO_SOI;
	/* tables up to the frame header */
	if (readtables(M_SOF0))
		return ERR_BAD_TABLES;
	getword();
	i = getbyte();
	if (i != 8)
		return ERR_NOT_8BIT;
	/* header dimensions, rounded up to 16, must equal the caller's */
	if (((getword() + 15) & ~15) != height)
		return ERR_HEIGHT_MISMATCH;
	if (((getword() + 15) & ~15) != width)
		return ERR_WIDTH_MISMATCH;
	if ((height & 15) || (width & 15))
		return ERR_BAD_WIDTH_OR_HEIGHT;
	info.nc = getbyte();
	if (info.nc > MAXCOMP)
		return ERR_TOO_MANY_COMPPS;
	for (i = 0; i < info.nc; i++) {
		int h, v;
		comps[i].cid = getbyte();
		comps[i].hv = getbyte();
		v = comps[i].hv & 15;
		h = comps[i].hv >> 4;
		comps[i].tq = getbyte();
		if (h > 3 || v > 3)
			return ERR_ILLEGAL_HV;
		if (comps[i].tq > 3)
			return ERR_QUANT_TABLE_SELECTOR;
	}
	/* tables between the frame header and the scan header */
	if (readtables(M_SOS))
		return ERR_BAD_TABLES;
	getword();
	info.ns = getbyte();
	if (info.ns != 3)
		return ERR_NOT_YCBCR_221111;
	for (i = 0; i < 3; i++) {
		dscans[i].cid = getbyte();
		/* high nibble: DC table selector, low nibble: AC table selector */
		tdc = getbyte();
		tac = tdc & 15;
		tdc >>= 4;
		if (tdc > 1 || tac > 1)
			return ERR_QUANT_TABLE_SELECTOR;
		/* look up the frame component this scan entry refers to */
		for (j = 0; j < info.nc; j++)
			if (comps[j].cid == dscans[i].cid)
				break;
		if (j == info.nc)
			return ERR_UNKNOWN_CID_IN_SCAN;
		dscans[i].hv = comps[j].hv;
		dscans[i].tq = comps[j].tq;
		dscans[i].hudc.dhuff = dec_huffdc + tdc;
		dscans[i].huac.dhuff = dec_huffac + tac;
	}

	/* remaining three scan header bytes must be 0, 63, 0 */
	i = getbyte();
	j = getbyte();
	m = getbyte();

	if (i != 0 || j != 63 || m != 0)
		return ERR_NOT_SEQUENTIAL_DCT;

	if (dscans[0].cid != 1 || dscans[1].cid != 2 || dscans[2].cid != 3)
		return ERR_NOT_YCBCR_221111;

	if (dscans[0].hv != 0x22 || dscans[1].hv != 0x11
		|| dscans[2].hv != 0x11)
		return ERR_NOT_YCBCR_221111;

	/* one mcu covers 16x16 pixels */
	mcusx = width >> 4;
	mcusy = height >> 4;


	/* build the scaled quant tables and start the bit reader at datap */
	idctqtab(quant[dscans[0].tq], decdata->dquant[0]);
	idctqtab(quant[dscans[1].tq], decdata->dquant[1]);
	idctqtab(quant[dscans[2].tq], decdata->dquant[2]);
	initcol(decdata->dquant);
	setinput(&glob_in, datap);

#if 0
	/* landing zone */
	img[len] = 0;
	img[len + 1] = 0xff;
	img[len + 2] = M_EOF;
#endif

	dec_initscans();

	/*
	 * decode_mcus() counts its block counter down from 6 and moves to
	 * the next scan entry when the counter equals 'next': four blocks
	 * for component 0, then one each for components 1 and 2.
	 */
	dscans[0].next = 6 - 4;
	dscans[1].next = 6 - 4 - 1;
	dscans[2].next = 6 - 4 - 1 - 1;	/* 411 encoding */
	for (my = 0; my < mcusy; my++) {
		for (mx = 0; mx < mcusx; mx++) {
			/* a restart marker is due when the countdown hits 0 */
			if (info.dri && !--info.nm)
				if (dec_checkmarker())
					return ERR_WRONG_MARKER;

			decode_mcus(&glob_in, decdata->dcts, 6, dscans, max);
			/* blocks 0-3 get a 128.5 offset, blocks 4-5 a 0.5 offset */
			idct(decdata->dcts, decdata->out, decdata->dquant[0],
				IFIX(128.5), max[0]);
			idct(decdata->dcts + 64, decdata->out + 64,
				decdata->dquant[0], IFIX(128.5), max[1]);
			idct(decdata->dcts + 128, decdata->out + 128,
				decdata->dquant[0], IFIX(128.5), max[2]);
			idct(decdata->dcts + 192, decdata->out + 192,
				decdata->dquant[0], IFIX(128.5), max[3]);
			idct(decdata->dcts + 256, decdata->out + 256,
				decdata->dquant[1], IFIX(0.5), max[4]);
			idct(decdata->dcts + 320, decdata->out + 320,
				decdata->dquant[2], IFIX(0.5), max[5]);

			/*
			 * Convert the mcu into its 16x16 pixel area of pic;
			 * the last argument is the row stride in bytes.
			 */
			switch (depth) {
			case 32:
				col221111_32(decdata->out, pic
					+ (my * 16 * mcusx + mx) * 16 * 4,
					mcusx * 16 * 4);
				break;
			case 24:
				col221111(decdata->out, pic
					+ (my * 16 * mcusx + mx) * 16 * 3,
					mcusx * 16 * 3);
				break;
			case 16:
				col221111_16(decdata->out, pic
					+ (my * 16 * mcusx + mx) * (16 * 2),
					mcusx * (16 * 2));
				break;
			default:
				return ERR_DEPTH_MISMATCH;
			}
		}
	}

	/* the entropy-coded data must be followed by M_EOI */
	m = dec_readmarker(&glob_in);
	if (m != M_EOI)
		return ERR_NO_EOI;

	return 0;
}

/****************************************************************/
/**************       huffman decoder             ***************/
/****************************************************************/

/* Bit reader helpers used only by the huffman decoder below. */
static int fillbits __P((struct in *, int, unsigned int));
static int dec_rec2
__P((struct in *, struct dec_hufftbl *, int *, int, int));

/*
 * Point the bit reader at 'p' and reset it: empty accumulator, no bits
 * available, no marker pending. The func and data members are not touched.
 */
static void setinput(struct in *in, unsigned char *p)
{
	in->marker = 0;
	in->bits = 0;
	in->left = 0;
	in->p = p;
}

/*
 * Refill the bit accumulator.
 *
 * 'le' is the number of valid bits currently held and 'bi' the accumulator
 * value. Input bytes are appended until more than 24 bits are held or a
 * marker is met. The new accumulator value is stored in in->bits and the
 * new bit count is returned.
 *
 * Once a marker is pending no more input is read; instead 16 zero bits are
 * appended whenever 16 or fewer bits are left.
 */
static int fillbits(struct in *in, int le, unsigned int bi)
{
	int b, m;

	if (in->marker) {
		if (le <= 16)
			in->bits = bi << 16, le += 16;
		return le;
	}
	while (le <= 24) {
		b = *in->p++;
		if (b == 0xff) {
			/* 0xff 0x00 stands for a literal 0xff data byte */
			m = *in->p++;
			if (m != 0) {
				if (m == M_EOF) {
					/* let the callback decide; 0 means carry on reading */
					if (in->func) {
						m = in->func(in->data);
						if (m == 0)
							continue;
					}
				}
				/* remember the marker and pad with zero bits */
				in->marker = m;
				if (le <= 16)
					bi = bi << 16, le += 16;
				break;
			}
		}
		bi = bi << 8 | b;
		le += 8;
	}
	in->bits = bi;		/* tmp... 2 return values needed */
	return le;
}

/*
 * Return the marker the bit reader has run into, or 0 when none is
 * pending after a refill. When a marker is returned it is cleared and
 * the bits still buffered are discarded.
 */
static int dec_readmarker(struct in *in)
{
	int pending;

	in->left = fillbits(in, in->left, in->bits);
	pending = in->marker;
	if (pending != 0) {
		in->marker = 0;
		in->left = 0;
	}
	return pending;
}

/*
 * The decoder keeps the bit count (le) and accumulator (bi) in locals.
 * LEBI_DCL declares them, LEBI_GET loads them from the reader state and
 * LEBI_PUT writes them back.
 */
#define LEBI_DCL	int le, bi
#define LEBI_GET(in)	(le = in->left, bi = in->bits)
#define LEBI_PUT(in)	(in->left = le, in->bits = bi)

/*
 * Take the next n bits from the accumulator, refilling it first when fewer
 * than n bits are available. Uses and updates the locals le and bi.
 */
#define GETBITS(in, n) (						\
	(le < (n) ? le = fillbits(in, le, bi), bi = in->bits : 0),	\
	(le -= (n)),							\
	bi >> le & ((1 << (n)) - 1)					\
	)

/* Give back the last n bits taken; only the bit count is adjusted. */
#define UNGETBITS(in, n) (	\
	le += (n)			\
)


/*
 * Slow path of DEC_REC, used when the fast table did not yield a value.
 *
 * in:   bit reader; its state is loaded here and written back on the
 *       normal return paths.
 * hu:   huffman table in use.
 * runp: receives the run length belonging to the decoded symbol.
 * c:    the DECBITS bits already taken from the stream.
 * i:    the llvals entry for c; non-zero means the code is known and only
 *       the value bits are missing, zero means the code is longer than
 *       DECBITS bits.
 *
 * Returns the decoded value. When no code matches, in->marker is set to
 * M_BADHUFF, *runp is set to 0 and 0 is returned, which the caller sees
 * as the end of the block.
 */
static int dec_rec2(struct in *in, struct dec_hufftbl *hu, int *runp, int c,
		int i)
{
	LEBI_DCL;

	LEBI_GET(in);
	if (i) {
		/* code known: give back the surplus bits, unpack run and size */
		UNGETBITS(in, i & 127);
		*runp = i >> 8 & 15;
		i >>= 16;
	} else {
		/* extend the code bit by bit until it fits a code length */
		for (i = DECBITS; (c = ((c << 1) | GETBITS(in, 1)))
			>= (hu->maxcode[i]); i++)
			;
		if (i >= 16) {
			in->marker = M_BADHUFF;
			/*
			 * Do not leave raw stream bits in *runp; the caller
			 * would use them as a run length.
			 */
			*runp = 0;
			return 0;
		}
		i = hu->vals[hu->valptr[i] + c - hu->maxcode[i - 1] * 2];
		*runp = i >> 4;
		i &= 15;
	}
	if (i == 0) {		/* sigh, 0xf0 is 11 bit */
		LEBI_PUT(in);
		return 0;
	}
	/* receive part */
	c = GETBITS(in, i);
	/* values below half range are negative; no shift of a negative number */
	if (c < (1 << (i - 1)))
		c += 1 - (1 << i);
	LEBI_PUT(in);
	return c;
}

/*
 * Decode one huffman symbol and its value.
 *
 * Takes DECBITS bits and looks them up in hu->llvals. If bit 7 of the entry
 * is set the value is already in the entry: the surplus bits are given back,
 * r receives the run length and the expression yields the value. Otherwise
 * dec_rec2() finishes the job; the local bit state is written back before
 * the call and reloaded after it. i is used as scratch.
 */
#define DEC_REC(in, hu, r, i)	(		\
	r = GETBITS(in, DECBITS),		\
	i = hu->llvals[r],			\
	i & 128 ?				\
	(					\
		UNGETBITS(in, i & 127),		\
		r = i >> 8 & 15,		\
		i >> 16				\
	)					\
	:					\
	(					\
		LEBI_PUT(in),			\
		i = dec_rec2(in, hu, &r, r, i),	\
		LEBI_GET(in),			\
		i				\
	)					\
)

/*
 * Decode n blocks of 64 coefficients each.
 *
 * in:   bit reader.
 * dct:  output, n * 64 ints; cleared first, then filled in stream order.
 * n:    number of blocks to decode.
 * sc:   scan entries; the next entry is used once the remaining block
 *       count equals sc->next.
 * maxp: receives one number per block, 64 minus the count of coefficient
 *       positions left unused (1 when only the DC value is present).
 *
 * A run length that does not fit into the rest of the block ends the
 * block instead of writing past it.
 */
static void decode_mcus(struct in *in, int *dct, int n, struct scan *sc,
	int *maxp)
{
	struct dec_hufftbl *hu;
	int i, r, t;
	LEBI_DCL;

	memset(dct, 0, n * 64 * sizeof(*dct));
	LEBI_GET(in);
	while (n-- > 0) {
		/* DC value is coded as a difference to the previous one */
		hu = sc->hudc.dhuff;
		*dct++ = (sc->dc += DEC_REC(in, hu, r, t));

		hu = sc->huac.dhuff;
		i = 63;
		while (i > 0) {
			t = DEC_REC(in, hu, r, t);
			if (t == 0 && r == 0) {
				/* end of block: the rest stays zero */
				dct += i;
				break;
			}
			if (r >= i) {
				/*
				 * Malformed data: r zeros plus one value do
				 * not fit into the i positions left. Storing
				 * it would write outside the block.
				 */
				dct += i;
				break;
			}
			dct += r;
			*dct++ = t;
			i -= r + 1;
		}
		*maxp++ = 64 - i;
		if (n == sc->next)
			sc++;
	}
	LEBI_PUT(in);
}

/*
 * Build the decoding tables in 'hu' from a huffman table definition.
 *
 * hufflen:  16 entries, the number of codes of length 1..16.
 * huffvals: the symbols, in code order.
 *
 * The counts come straight from the file. Symbols beyond the capacity of
 * hu->vals are dropped, and codes that do not fit their length are kept
 * out of the fast table, so that a malformed definition cannot write
 * outside the tables.
 */
static void dec_makehuff(struct dec_hufftbl *hu, int *hufflen,
	unsigned char *huffvals)
{
	int code, k, i, j, d, c, v, val;
	unsigned int x;

	for (i = 0; i < (1 << DECBITS); i++)
		hu->llvals[i] = 0;

/*
 * llvals layout:
 *
 * value v already known, run r, backup u bits:
 *  vvvvvvvvvvvvvvvv 0000 rrrr 1 uuuuuuu
 * value unknown, size b bits, run r, backup u bits:
 *  000000000000bbbb 0000 rrrr 0 uuuuuuu
 * value and size unknown:
 *  0000000000000000 0000 0000 0 0000000
 */
	code = 0;
	k = 0;
	for (i = 0; i < 16; i++, code <<= 1) {	/* sizes */
		hu->valptr[i] = k;
		for (j = 0; j < hufflen[i]; j++) {
			/* never store more symbols than vals can hold */
			if (k >= (int)sizeof(hu->vals))
				break;
			hu->vals[k] = *huffvals++;
			/* a code of length i + 1 must be below 1 << (i + 1) */
			if (i < DECBITS && code < (1 << (i + 1))) {
				c = code << (DECBITS - 1 - i);
				v = hu->vals[k] & 0x0f;	/* size */
				for (d = 1 << (DECBITS - 1 - i); --d >= 0;) {
					/* both fit in table */
					if (v + i < DECBITS) {
						val = d >> (DECBITS - 1 - v -
							    i);
						if (v && val < (1 << (v - 1)))
							val += 1 - (1 << v);
						/*
						 * Shift as unsigned: val may
						 * be negative here.
						 */
						x = (unsigned int)val << 16 |
							(unsigned int)
							(hu->vals[k] & 0xf0)
							<< 4 |
							(unsigned int)
							(DECBITS - (i + 1 + v))
							| 128u;
					} else
						x = (unsigned int)v << 16 |
							(unsigned int)
							(hu->vals[k] & 0xf0)
							<< 4 |
							(unsigned int)
							(DECBITS - (i + 1));
					hu->llvals[c | d] = x;
				}
			}
			code++;
			k++;
		}
		hu->maxcode[i] = code;
	}
	hu->maxcode[16] = 0x20000;	/* always terminate decode */
}

/****************************************************************/
/**************             idct                  ***************/
/****************************************************************/

/* Fixed-point constants used by the idct butterflies. */
#define ONE ((PREC)IFIX(1.))
#define S2  ((PREC)IFIX(0.382683432))
#define C2  ((PREC)IFIX(0.923879532))
#define C4  ((PREC)IFIX(0.707106781))

/* Twice S2 and C2, and the reciprocal of C4. */
#define S22 ((PREC)IFIX(2 * 0.382683432))
#define C22 ((PREC)IFIX(2 * 0.923879532))
#define IC4 ((PREC)IFIX(1 / 0.707106781))

#define C3IC1 ((PREC)IFIX(0.847759065))	/* c3/c1 */
#define C5IC1 ((PREC)IFIX(0.566454497))	/* c5/c1 */
#define C7IC1 ((PREC)IFIX(0.198912367))	/* c7/c1 */

/*
 * Butterfly helpers; all use the local 't' as scratch.
 * XPP: a = a + b, b = a - b
 * XMP: a = a - b, b = a + b
 * XPM: a = a + b, b = b - a
 */
#define XPP(a, b) (t = a + b, b = a - b, a = t)
#define XMP(a, b) (t = a - b, b = a + b, a = t)
#define XPM(a, b) (t = a + b, b = b - a, a = t)

/* a = a * (c - s) + (a + b) * s, b = b * (c + s) - (a + b) * s, in fixed point. */
#define ROT(a, b, s, c) (t = IMULT(a + b, s),	\
			a = IMULT(a, c - s) + t,	\
			b = IMULT(b, c + s) - t)

/*
 * One 8-point pass over the locals t0..t7, in place. The order of the
 * steps matters; each one consumes the results of the previous ones.
 */
#define IDCT				\
(					\
	XPP(t0, t1),			\
	XMP(t2, t3),			\
	t2 = IMULT(t2, IC4) - t3,	\
	XPP(t0, t3),			\
	XPP(t1, t2),			\
	XMP(t4, t7),			\
	XPP(t5, t6),			\
	XMP(t5, t7),			\
	t5 = IMULT(t5, IC4),		\
	ROT(t4, t6, S22, C22),		\
	t6 -= t7,			\
	t5 -= t6,			\
	t4 -= t5,			\
	XPP(t0, t7),			\
	XPP(t1, t6),			\
	XPP(t2, t5),			\
	XPP(t3, t4)			\
)

/*
 * Order in which idct() picks coefficients from its input during the first
 * pass: eight groups of eight indices, loaded into t0, t5, t2, t7, t1, t4,
 * t3, t6 in that order.
 */
static unsigned char zig2[64] = {
	0, 2, 3, 9, 10, 20, 21, 35,
	14, 16, 25, 31, 39, 46, 50, 57,
	5, 7, 12, 18, 23, 33, 37, 48,
	27, 29, 41, 44, 52, 55, 59, 62,
	15, 26, 30, 40, 45, 51, 56, 58,
	1, 4, 8, 11, 19, 22, 34, 36,
	28, 42, 43, 53, 54, 60, 61, 63,
	6, 13, 17, 24, 32, 38, 47, 49
};

/*
 * Dequantize one block of 64 coefficients and transform it.
 *
 * in:     the coefficients as stored by decode_mcus().
 * out:    receives the 64 results, with the fixed-point fraction removed.
 * lquant: scaled quantization table (see idctqtab()); each coefficient is
 *         multiplied by the entry at the same index.
 * off:    fixed-point offset added to the first coefficient.
 * max:    block extent reported by decode_mcus(); 1 means that only in[0]
 *         is used.
 */
void idct(int *in, int *out, PREC *lquant, PREC off, int max)
{
	PREC t0, t1, t2, t3, t4, t5, t6, t7, t;
	PREC tmp[64], *tmpp;
	int i, j;
	unsigned char *zig2p;

	t0 = off;
	if (max == 1) {
		/* only in[0] counts: every output gets the same value */
		t0 += in[0] * lquant[0];
		for (i = 0; i < 64; i++)
			out[i] = ITOINT(t0);
		return;
	}
	zig2p = zig2;
	tmpp = tmp;
	/* first pass: eight coefficients per round, picked via zig2 */
	for (i = 0; i < 8; i++) {
		j = *zig2p++;
		t0 += in[j] * lquant[j];
		j = *zig2p++;
		t5 = in[j] * lquant[j];
		j = *zig2p++;
		t2 = in[j] * lquant[j];
		j = *zig2p++;
		t7 = in[j] * lquant[j];
		j = *zig2p++;
		t1 = in[j] * lquant[j];
		j = *zig2p++;
		t4 = in[j] * lquant[j];
		j = *zig2p++;
		t3 = in[j] * lquant[j];
		j = *zig2p++;
		t6 = in[j] * lquant[j];
		IDCT;
		/* results of round i go to tmp[i], tmp[8 + i], ... */
		tmpp[0 * 8] = t0;
		tmpp[1 * 8] = t1;
		tmpp[2 * 8] = t2;
		tmpp[3 * 8] = t3;
		tmpp[4 * 8] = t4;
		tmpp[5 * 8] = t5;
		tmpp[6 * 8] = t6;
		tmpp[7 * 8] = t7;
		tmpp++;
		/* the offset is only added in the first round */
		t0 = 0;
	}
	/* second pass: eight consecutive tmp entries per round */
	for (i = 0; i < 8; i++) {
		t0 = tmp[8 * i + 0];
		t1 = tmp[8 * i + 1];
		t2 = tmp[8 * i + 2];
		t3 = tmp[8 * i + 3];
		t4 = tmp[8 * i + 4];
		t5 = tmp[8 * i + 5];
		t6 = tmp[8 * i + 6];
		t7 = tmp[8 * i + 7];
		IDCT;
		out[8 * i + 0] = ITOINT(t0);
		out[8 * i + 1] = ITOINT(t1);
		out[8 * i + 2] = ITOINT(t2);
		out[8 * i + 3] = ITOINT(t3);
		out[8 * i + 4] = ITOINT(t4);
		out[8 * i + 5] = ITOINT(t5);
		out[8 * i + 6] = ITOINT(t6);
		out[8 * i + 7] = ITOINT(t7);
	}
}

/*
 * For row i and column j of the 8x8 block, zig[i * 8 + j] is the index
 * used for that position in the quantization tables (see idctqtab()).
 */
static unsigned char zig[64] = {
	0, 1, 5, 6, 14, 15, 27, 28,
	2, 4, 7, 13, 16, 26, 29, 42,
	3, 8, 12, 17, 25, 30, 41, 43,
	9, 11, 18, 24, 31, 40, 44, 53,
	10, 19, 23, 32, 39, 45, 52, 54,
	20, 22, 33, 38, 46, 51, 55, 60,
	21, 34, 37, 47, 50, 56, 59, 61,
	35, 36, 48, 49, 57, 58, 62, 63
};

/*
 * Fixed-point scale factors, one per row/column index; idctqtab() folds
 * the product of the row and the column factor into each table entry.
 */
static PREC aaidct[8] = {
	IFIX(0.3535533906), IFIX(0.4903926402),
	IFIX(0.4619397663), IFIX(0.4157348062),
	IFIX(0.3535533906), IFIX(0.2777851165),
	IFIX(0.1913417162), IFIX(0.0975451610)
};


/*
 * Turn a raw quantization table into the scaled table used by idct():
 * every entry is multiplied by the fixed-point product of the scale
 * factors of its row and column.
 */
static void idctqtab(unsigned char *qin, PREC *qout)
{
	int row, col, pos;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++) {
			pos = zig[row * 8 + col];
			qout[pos] = qin[pos] * IMULT(aaidct[row], aaidct[col]);
		}
	}
}

/* Multiply all 64 entries of table q by the fixed-point factor sc. */
static void scaleidctqtab(PREC *q, PREC sc)
{
	PREC *end = q + 64;

	while (q < end) {
		*q = IMULT(*q, sc);
		q++;
	}
}

/****************************************************************/
/**************          color decoder            ***************/
/****************************************************************/

/* Selects the CBCRCG variant that adds 128 before shifting (see below). */
#define ROUND

/*
 * YCbCr Color transformation:
 *
 * y:0..255   Cb:-128..127   Cr:-128..127
 *
 *      R = Y                + 1.40200 * Cr
 *      G = Y - 0.34414 * Cb - 0.71414 * Cr
 *      B = Y + 1.77200 * Cb
 *
 * =>
 *      Cr *= 1.40200;
 *      Cb *= 1.77200;
 *      Cg = 0.19421 * Cb + .50937 * Cr;
 *      R = Y + Cr;
 *      G = Y - Cg;
 *      B = Y + Cb;
 *
 * =>
 *      Cg = (50 * Cb + 130 * Cr + 128) >> 8;
 *
 * The scaling of Cb and Cr is folded into their quantization tables by
 * initcol().
 */

/*
 * Fold the colour conversion factors into the quantization tables:
 * table 1 is scaled by 1.772 and table 2 by 1.402.
 */
static void initcol(PREC q[][64])
{
	PREC *cb_table = q[1];
	PREC *cr_table = q[2];

	scaleidctqtab(cb_table, IFIX(1.77200));
	scaleidctqtab(cr_table, IFIX(1.40200));
}

/*
 * Store x into a, limited to the range 0..255. The unsigned compare
 * catches negative values and values above 255 with a single test.
 * x is evaluated more than once.
 */
/* This is optimized for the stupid sun SUNWspro compiler. */
#define STORECLAMP(a, x)			\
(						\
	(a) = (x),				\
	(unsigned int)(x) >= 256 ?		\
	((a) = (x) < 0 ? 0 : 255)		\
	:					\
	0					\
)

/* Yield x limited to the range 0..255; x is evaluated more than once. */
#define CLAMP(x) ((unsigned int)(x) >= 256 ? ((x) < 0 ? 0 : 255) : (x))

/*
 * Load the chroma pair at (yin, xin) from outc into the locals cb and cr
 * and derive cg from them. With ROUND defined 128 is added before the
 * shift by 8; the other variant uses smaller factors and a shift by 4.
 */
#ifdef ROUND

#define CBCRCG(yin, xin)			\
(						\
	cb = outc[0  + yin * 8 + xin],		\
	cr = outc[64 + yin * 8 + xin],		\
	cg = (50 * cb + 130 * cr + 128) >> 8	\
)

#else

#define CBCRCG(yin, xin)			\
(						\
	cb = outc[0  + yin*8 + xin],		\
	cr = outc[64 + yin*8 + xin],		\
	cg = (3 * cb + 8 * cr) >> 4		\
)

#endif

/*
 * Write one pixel with three bytes per pixel: luma is taken from outy at
 * (yin, xin) and the bytes y + cr, y - cg, y + cb are stored, clamped, at
 * pixel position xout of row p. Uses the locals y, cr, cg and cb.
 */
#define PIC(yin, xin, p, xout)			\
(						\
	y = outy[(yin) * 8 + xin],		\
	STORECLAMP(p[(xout) * 3 + 0], y + cr),	\
	STORECLAMP(p[(xout) * 3 + 1], y - cg),	\
	STORECLAMP(p[(xout) * 3 + 2], y + cb)	\
)

/*
 * Write one pixel with two bytes per pixel. The three channels are packed
 * into 16 bits and stored low byte first (__LITTLE_ENDIAN) or high byte
 * first. The first and last variant keep 5, 6 and 5 bits per channel, the
 * CONFIG_PPC variant 5 bits each. 'add' is a small per-pixel offset added
 * before the low bits are dropped.
 * NOTE(review): 'add' looks like a dither offset - confirm.
 */
#ifdef __LITTLE_ENDIAN
#define PIC_16(yin, xin, p, xout, add)				\
(								\
	y = outy[(yin) * 8 + xin],				\
	y = ((CLAMP(y + cr + add*2+1) & 0xf8) <<  8) |		\
		((CLAMP(y - cg + add)     & 0xfc) <<  3) |	\
		((CLAMP(y + cb + add*2+1))        >>  3),	\
	p[(xout) * 2 + 0] = y & 0xff,				\
	p[(xout) * 2 + 1] = y >> 8				\
)
#else
#ifdef CONFIG_PPC
#define PIC_16(yin, xin, p, xout, add)				\
(								\
	y = outy[(yin) * 8 + xin],				\
	y = ((CLAMP(y + cr + add*2+1) & 0xf8) <<  7) |		\
		((CLAMP(y - cg + add*2+1) & 0xf8) <<  2) |	\
		((CLAMP(y + cb + add*2+1))        >>  3),	\
	p[(xout) * 2 + 0] = y >> 8,				\
	p[(xout) * 2 + 1] = y & 0xff				\
)
#else
#define PIC_16(yin, xin, p, xout, add)				\
(								\
	y = outy[(yin) * 8 + xin],				\
	y = ((CLAMP(y + cr + add*2+1) & 0xf8) <<  8) |		\
		((CLAMP(y - cg + add)     & 0xfc) <<  3) |	\
		((CLAMP(y + cb + add*2+1))        >>  3),	\
	p[(xout) * 2 + 0] = y >> 8,				\
	p[(xout) * 2 + 1] = y & 0xff				\
)
#endif
#endif

/* Like PIC, but with four bytes per pixel; the fourth byte is set to 0. */
#define PIC_32(yin, xin, p, xout)		\
(						\
	y = outy[(yin) * 8 + xin],		\
	STORECLAMP(p[(xout) * 4 + 0], y + cr),	\
	STORECLAMP(p[(xout) * 4 + 1], y - cg),	\
	STORECLAMP(p[(xout) * 4 + 2], y + cb),	\
	p[(xout) * 4 + 3] = 0			\
)

/*
 * Convert the 2x2 pixels that share the chroma sample in column xin of the
 * current chroma row: two pixels on row pic0 and two on row pic1, at
 * output columns xin * 2 and xin * 2 + 1. xin / 4 selects the left or
 * right luma block. The _16 and _32 variants differ only in the pixel
 * writer used; the _16 variant passes a different 'add' value for each of
 * the four pixels.
 */
#define PIC221111(xin)							\
(									\
	CBCRCG(0, xin),							\
	PIC(xin / 4 * 8 + 0, (xin & 3) * 2 + 0, pic0, xin * 2 + 0),	\
	PIC(xin / 4 * 8 + 0, (xin & 3) * 2 + 1, pic0, xin * 2 + 1),	\
	PIC(xin / 4 * 8 + 1, (xin & 3) * 2 + 0, pic1, xin * 2 + 0),	\
	PIC(xin / 4 * 8 + 1, (xin & 3) * 2 + 1, pic1, xin * 2 + 1)	\
)

#define PIC221111_16(xin)						       \
(									       \
	CBCRCG(0, xin),							       \
	PIC_16(xin / 4 * 8 + 0, (xin & 3) * 2 + 0, pic0, xin * 2 + 0, 3),      \
	PIC_16(xin / 4 * 8 + 0, (xin & 3) * 2 + 1, pic0, xin * 2 + 1, 0),      \
	PIC_16(xin / 4 * 8 + 1, (xin & 3) * 2 + 0, pic1, xin * 2 + 0, 1),      \
	PIC_16(xin / 4 * 8 + 1, (xin & 3) * 2 + 1, pic1, xin * 2 + 1, 2)       \
)

#define PIC221111_32(xin)						\
(									\
	CBCRCG(0, xin),							\
	PIC_32(xin / 4 * 8 + 0, (xin & 3) * 2 + 0, pic0, xin * 2 + 0),	\
	PIC_32(xin / 4 * 8 + 0, (xin & 3) * 2 + 1, pic0, xin * 2 + 1),	\
	PIC_32(xin / 4 * 8 + 1, (xin & 3) * 2 + 0, pic1, xin * 2 + 0),	\
	PIC_32(xin / 4 * 8 + 1, (xin & 3) * 2 + 1, pic1, xin * 2 + 1)	\
)

/*
 * Convert one decoded mcu to 16x16 pixels with three bytes per pixel.
 *
 * out:   the six idct output blocks, four luma blocks followed by the
 *        two chroma blocks.
 * pic:   address of the top left pixel of the mcu in the picture.
 * width: length of one picture row in bytes.
 */
static void col221111(int *out, unsigned char *pic, int width)
{
	int half, pair, col;
	unsigned char *pic0, *pic1;
	int *outy, *outc;
	int cr, cg, cb, y;

	outy = out;
	outc = out + 4 * 64;
	pic0 = pic;
	pic1 = pic0 + width;
	for (half = 0; half < 2; half++) {
		/* four chroma rows, each covering two picture rows */
		for (pair = 0; pair < 4; pair++) {
			col = 0;
			while (col < 8) {
				PIC221111(col);
				col++;
			}
			outc += 8;
			outy += 16;
			pic0 += width * 2;
			pic1 += width * 2;
		}
		/* continue with the lower pair of luma blocks */
		outy += 64 * 2 - 16 * 4;
	}
}

/*
 * Convert one decoded mcu to 16x16 pixels with two bytes per pixel.
 *
 * out:   the six idct output blocks, four luma blocks followed by the
 *        two chroma blocks.
 * pic:   address of the top left pixel of the mcu in the picture.
 * width: length of one picture row in bytes.
 */
static void col221111_16(int *out, unsigned char *pic, int width)
{
	int half, pair, col;
	unsigned char *pic0, *pic1;
	int *outy, *outc;
	int cr, cg, cb, y;

	outy = out;
	outc = out + 4 * 64;
	pic0 = pic;
	pic1 = pic0 + width;
	for (half = 0; half < 2; half++) {
		/* four chroma rows, each covering two picture rows */
		for (pair = 0; pair < 4; pair++) {
			col = 0;
			while (col < 8) {
				PIC221111_16(col);
				col++;
			}
			outc += 8;
			outy += 16;
			pic0 += width * 2;
			pic1 += width * 2;
		}
		/* continue with the lower pair of luma blocks */
		outy += 64 * 2 - 16 * 4;
	}
}

/*
 * Convert one decoded mcu to 16x16 pixels with four bytes per pixel.
 *
 * out:   the six idct output blocks, four luma blocks followed by the
 *        two chroma blocks.
 * pic:   address of the top left pixel of the mcu in the picture.
 * width: length of one picture row in bytes.
 */
static void col221111_32(int *out, unsigned char *pic, int width)
{
	int half, pair, col;
	unsigned char *pic0, *pic1;
	int *outy, *outc;
	int cr, cg, cb, y;

	outy = out;
	outc = out + 4 * 64;
	pic0 = pic;
	pic1 = pic0 + width;
	for (half = 0; half < 2; half++) {
		/* four chroma rows, each covering two picture rows */
		for (pair = 0; pair < 4; pair++) {
			col = 0;
			while (col < 8) {
				PIC221111_32(col);
				col++;
			}
			outc += 8;
			outy += 16;
			pic0 += width * 2;
			pic1 += width * 2;
		}
		/* continue with the lower pair of luma blocks */
		outy += 64 * 2 - 16 * 4;
	}
}