/*
 * jpeg-odct.cc --
 *
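 *      Fixed-point 8x8 inverse DCT routines for JPEG decoding: xidct(),
 *      which uses a coefficient bitmask and zero tests to skip work, and
 *      idct(), a straightforward macro-based version.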
 *
 * Copyright (c) 1996-2002 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * A. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * B. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * C. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

void
xidct(register short* bp, u_int* mask, u_char* p, int stride)
{
  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */

  u_int m0 = mask[0];
  u_int m1 = mask[1];

/*FIXME*/
  if (*bp)
	  m0 |= 1;

  for (int rowctr = 0; rowctr < 8; ++rowctr) {
    int tmp0, tmp1, tmp2, tmp3;
    int tmp10, tmp11, tmp12, tmp13;
    int z1, z2, z3, z4, z5;

    /*
     * Due to quantization, we will usually find that many of the input
     * coefficients are zero, especially the AC terms.  We can exploit this
     * by short-circuiting the IDCT calculation for any row in which all
     * the AC terms are zero.  In that case each output is equal to the
     * DC coefficient (with scale factor as needed).
     * With typical images and quantization tables, half or more of the
     * row DCT calculations can be simplified this way.
     */
    if ((m0 & 0xfe) == 0) {
      /* AC terms all zero */
      int v;
     if (m0 & 1) {
	  v = (bp[0] << PASS1_BITS) & 0xffff;
	  v |= v << 16;
      } else
	  v = 0;
      ((u_int*)bp)[0] = v;
      ((u_int*)bp)[1] = v;
      ((u_int*)bp)[2] = v;
      ((u_int*)bp)[3] = v;
      goto nextrow;
    }

    /* Even part: reverse the even part of the forward DCT. */
    /* The rotator is sqrt(2)*c(-6). */
    if (m0 & 1 << 6) {
	int d6 = bp[6];
	if (m0 & 1 << 4) {
	    int d4 = bp[4];
	    if (m0 & 1 << 2) {
	        int d2 = bp[2];
		if (m0 & 1) {
		    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
		    int d0 = bp[0];
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    } else {
		if (m0 & 1) {
		    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
		    int d0 = bp[0];
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, -FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    }
	} else {
	    if (m0 & 1 << 2) {
	        int d2 = bp[2];
		if (m0 & 1) {
		    /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
		    int d0 = bp[0];
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    } else {
		if (m0 & 1) {
		    /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
		    int d0 = bp[0];
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    }
	}
    } else {
	if (m0 & 1 << 4) {
	    int d4 = bp[4];
	    if (m0 & 1 << 2) {
		int d2 = bp[2];
		if (m0 & 1) {
		    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
		    int d0 = bp[0];
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    } else {
		if (m0 & 1) {
		    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
		    int d0 = bp[0];
		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
		} else {
		    /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
		    tmp10 = tmp13 = d4 << CONST_BITS;
		    tmp11 = tmp12 = -tmp10;
		}
	    }
	} else {
	    if (m0 & 1 << 2) {
		int d2 = bp[2];
		if (m0 & 1) {
		    /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
		    int d0 = bp[0];
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    } else {
		if (m0 & 1) {
		    /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
		    int d0 = bp[0];
		    tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
		} else {
		    /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
		    tmp10 = tmp13 = tmp11 = tmp12 = 0;
		}
	    }
	}
    }


    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    if (m0 & 1 << 7) {
	int d7 = bp[7];
	if (m0 & 1 << 5) {
	    int d5 = bp[5];
	    if (m0 & 1 << 3) {
	        int d3 = bp[3];
		if (m0 & 1 << 1) {
		    int d1 = bp[1];
		    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
		    z1 = d7 + d1;
		    z2 = d5 + d3;
		    z3 = d7 + d3;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(z3 + z4, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
		    z1 = d7;
		    z2 = d5 + d3;
		    z3 = d7 + d3;
		    z5 = MULTIPLY(z3 + d5, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 = z1 + z4;
		}
	    } else {
		if (m0 & 1 << 1) {
		    int d1 = bp[1];
		    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
		    z1 = d7 + d1;
		    z2 = d5;
		    z3 = d7;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(z3 + z4, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 = z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
		    tmp0 = MULTIPLY(d7, - FIX(0.601344887));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    tmp1 = MULTIPLY(d5, - FIX(0.509795578));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));
		    z5 = MULTIPLY(d5 + d7, FIX(1.175875602));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z3;
		    tmp1 += z4;
		    tmp2 = z2 + z3;
		    tmp3 = z1 + z4;
		}
	    }
	} else {
	    if (m0 & 1 << 3) {
		int d3 = bp[3];
		if (m0 & 1 << 1) {
		    int d1 = bp[1];
		    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
		    z1 = d7 + d1;
		    z3 = d7 + d3;
		    z5 = MULTIPLY(z3 + d1, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(d3, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(d1, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 = z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
		    z3 = d7 + d3;

		    tmp0 = MULTIPLY(d7, - FIX(0.601344887));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    tmp2 = MULTIPLY(d3, FIX(0.509795579));
		    z2 = MULTIPLY(d3, - FIX(2.562915447));
		    z5 = MULTIPLY(z3, FIX(1.175875602));
		    z3 = MULTIPLY(z3, - FIX(0.785694958));

		    tmp0 += z3;
		    tmp1 = z2 + z5;
		    tmp2 += z3;
		    tmp3 = z1 + z5;
		}
	    } else {
		if (m0 & 1 << 1) {
		    int d1 = bp[1];
		    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
		    z1 = d7 + d1;
		    z5 = MULTIPLY(z1, FIX(1.175875602));

		    z1 = MULTIPLY(z1, FIX(0.275899379));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    tmp0 = MULTIPLY(d7, - FIX(1.662939224));
		    z4 = MULTIPLY(d1, - FIX(0.390180644));
		    tmp3 = MULTIPLY(d1, FIX(1.111140466));

		    tmp0 += z1;
		    tmp1 = z4 + z5;
		    tmp2 = z3 + z5;
		    tmp3 += z1;
		} else {
		    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
		    tmp0 = MULTIPLY(d7, - FIX(1.387039845));
		    tmp1 = MULTIPLY(d7, FIX(1.175875602));
		    tmp2 = MULTIPLY(d7, - FIX(0.785694958));
		    tmp3 = MULTIPLY(d7, FIX(0.275899379));
		}
	    }
	}
    } else {
	if (m0 & 1 << 5) {
	    int d5 = bp[5];
	    if (m0 & 1 << 3) {
		int d3 = bp[3];
		if (m0 & 1 << 1) {
		    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
		    int d1 = bp[1];
		    z2 = d5 + d3;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(d3 + z4, FIX(1.175875602));

		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(d1, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(d3, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 = z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
		    z2 = d5 + d3;

		    z5 = MULTIPLY(z2, FIX(1.175875602));
		    tmp1 = MULTIPLY(d5, FIX(1.662939225));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));
		    z2 = MULTIPLY(z2, - FIX(1.387039845));
		    tmp2 = MULTIPLY(d3, FIX(1.111140466));
		    z3 = MULTIPLY(d3, - FIX(1.961570560));

		    tmp0 = z3 + z5;
		    tmp1 += z2;
		    tmp2 += z2;
		    tmp3 = z4 + z5;
		}
	    } else {
		if (m0 & 1 << 1) {
		    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
		    int d1 = bp[1];
		    z4 = d5 + d1;

		    z5 = MULTIPLY(z4, FIX(1.175875602));
		    z1 = MULTIPLY(d1, - FIX(0.899976223));
		    tmp3 = MULTIPLY(d1, FIX(0.601344887));
		    tmp1 = MULTIPLY(d5, - FIX(0.509795578));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z4 = MULTIPLY(z4, FIX(0.785694958));

		    tmp0 = z1 + z5;
		    tmp1 += z4;
		    tmp2 = z2 + z5;
		    tmp3 += z4;
		} else {
		    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
		    tmp0 = MULTIPLY(d5, FIX(1.175875602));
		    tmp1 = MULTIPLY(d5, FIX(0.275899380));
		    tmp2 = MULTIPLY(d5, - FIX(1.387039845));
		    tmp3 = MULTIPLY(d5, FIX(0.785694958));
		}
	    }
	} else {
	    if (m0 & 1 << 3) {
		int d3 = bp[3];
		if (m0 & 1 << 1) {
		    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
		    int d1 = bp[1];
		    z5 = d1 + d3;
		    tmp3 = MULTIPLY(d1, FIX(0.211164243));
		    tmp2 = MULTIPLY(d3, - FIX(1.451774981));
		    z1 = MULTIPLY(d1, FIX(1.061594337));
		    z2 = MULTIPLY(d3, - FIX(2.172734803));
		    z4 = MULTIPLY(z5, FIX(0.785694958));
		    z5 = MULTIPLY(z5, FIX(1.175875602));

		    tmp0 = z1 - z4;
		    tmp1 = z2 + z4;
		    tmp2 += z5;
		    tmp3 += z5;
		} else {
		    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
		    tmp0 = MULTIPLY(d3, - FIX(0.785694958));
		    tmp1 = MULTIPLY(d3, - FIX(1.387039845));
		    tmp2 = MULTIPLY(d3, - FIX(0.275899379));
		    tmp3 = MULTIPLY(d3, FIX(1.175875602));
		}
	    } else {
		if (m0 & 1 << 1) {
		    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
		    int d1 = bp[1];
		    tmp0 = MULTIPLY(d1, FIX(0.275899379));
		    tmp1 = MULTIPLY(d1, FIX(0.785694958));
		    tmp2 = MULTIPLY(d1, FIX(1.175875602));
		    tmp3 = MULTIPLY(d1, FIX(1.387039845));
		} else {
		    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
		    tmp0 = tmp1 = tmp2 = tmp3 = 0;
		}
	    }
	}
    }

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    bp[0] =  DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
    bp[7] =  DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
    bp[1] =  DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
    bp[6] =  DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
    bp[2] =  DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
    bp[5] =  DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
    bp[3] =  DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
    bp[4] =  DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);

 nextrow:
    bp += 8;		/* advance pointer to next row */
    m0 >>= 8;
    m0 |= m1 << 24;
    m1 >>= 8;
  }

  /* Pass 2: process columns. */
  /* Note that we must descale the results by a factor of 8 == 2**3, */
  /* and also undo the PASS1_BITS scaling. */

  bp -= 64;
  for (rowctr = 8; --rowctr >= 0;) {
    int tmp0, tmp1, tmp2, tmp3;
    int tmp10, tmp11, tmp12, tmp13;
    int z1, z2, z3, z4, z5;

    /* Columns of zeroes can be exploited in the same way as we did with rows.
     * However, the row calculation has created many nonzero AC terms, so the
     * simplification applies less often (typically 5% to 10% of the time).
     * On machines with very fast multiplication, it's possible that the
     * test takes more time than it's worth.  In that case this section
     * may be commented out.
     */

    int d0 = bp[8*0];
    int d1 = bp[8*1];
    int d2 = bp[8*2];
    int d3 = bp[8*3];
    int d4 = bp[8*4];
    int d5 = bp[8*5];
    int d6 = bp[8*6];
    int d7 = bp[8*7];

    /* Even part: reverse the even part of the forward DCT. */
    /* The rotator is sqrt(2)*c(-6). */
    if (d6) {
	if (d4) {
	    if (d2) {
		if (d0) {
		    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    } else {
		if (d0) {
		    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, -FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    }
	} else {
	    if (d2) {
		if (d0) {
		    /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    } else {
		if (d0) {
		    /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    }
	}
    } else {
	if (d4) {
	    if (d2) {
		if (d0) {
		    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    } else {
		if (d0) {
		    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
		} else {
		    /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
		    tmp10 = tmp13 = d4 << CONST_BITS;
		    tmp11 = tmp12 = -tmp10;
		}
	    }
	} else {
	    if (d2) {
		if (d0) {
		    /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    } else {
		if (d0) {
		    /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
		    tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
		} else {
		    /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
		    tmp10 = tmp13 = tmp11 = tmp12 = 0;
		}
	    }
	}
    }

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */
    if (d7) {
	if (d5) {
	    if (d3) {
		if (d1) {
		    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
		    z1 = d7 + d1;
		    z2 = d5 + d3;
		    z3 = d7 + d3;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(z3 + z4, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
		    z1 = d7;
		    z2 = d5 + d3;
		    z3 = d7 + d3;
		    z5 = MULTIPLY(z3 + d5, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 = z1 + z4;
		}
	    } else {
		if (d1) {
		    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
		    z1 = d7 + d1;
		    z2 = d5;
		    z3 = d7;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(z3 + z4, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 = z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
		    tmp0 = MULTIPLY(d7, - FIX(0.601344887));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    tmp1 = MULTIPLY(d5, - FIX(0.509795578));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));
		    z5 = MULTIPLY(d5 + d7, FIX(1.175875602));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z3;
		    tmp1 += z4;
		    tmp2 = z2 + z3;
		    tmp3 = z1 + z4;
		}
	    }
	} else {
	    if (d3) {
		if (d1) {
		    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
		    z1 = d7 + d1;
		    z3 = d7 + d3;
		    z5 = MULTIPLY(z3 + d1, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(d3, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(d1, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 = z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
		    z3 = d7 + d3;

		    tmp0 = MULTIPLY(d7, - FIX(0.601344887));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    tmp2 = MULTIPLY(d3, FIX(0.509795579));
		    z2 = MULTIPLY(d3, - FIX(2.562915447));
		    z5 = MULTIPLY(z3, FIX(1.175875602));
		    z3 = MULTIPLY(z3, - FIX(0.785694958));

		    tmp0 += z3;
		    tmp1 = z2 + z5;
		    tmp2 += z3;
		    tmp3 = z1 + z5;
		}
	    } else {
		if (d1) {
		    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
		    z1 = d7 + d1;
		    z5 = MULTIPLY(z1, FIX(1.175875602));

		    z1 = MULTIPLY(z1, FIX(0.275899379));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    tmp0 = MULTIPLY(d7, - FIX(1.662939224));
		    z4 = MULTIPLY(d1, - FIX(0.390180644));
		    tmp3 = MULTIPLY(d1, FIX(1.111140466));

		    tmp0 += z1;
		    tmp1 = z4 + z5;
		    tmp2 = z3 + z5;
		    tmp3 += z1;
		} else {
		    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
		    tmp0 = MULTIPLY(d7, - FIX(1.387039845));
		    tmp1 = MULTIPLY(d7, FIX(1.175875602));
		    tmp2 = MULTIPLY(d7, - FIX(0.785694958));
		    tmp3 = MULTIPLY(d7, FIX(0.275899379));
		}
	    }
	}
    } else {
	if (d5) {
	    if (d3) {
		if (d1) {
		    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
		    z2 = d5 + d3;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(d3 + z4, FIX(1.175875602));

		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(d1, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(d3, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 = z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
		    z2 = d5 + d3;

		    z5 = MULTIPLY(z2, FIX(1.175875602));
		    tmp1 = MULTIPLY(d5, FIX(1.662939225));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));
		    z2 = MULTIPLY(z2, - FIX(1.387039845));
		    tmp2 = MULTIPLY(d3, FIX(1.111140466));
		    z3 = MULTIPLY(d3, - FIX(1.961570560));

		    tmp0 = z3 + z5;
		    tmp1 += z2;
		    tmp2 += z2;
		    tmp3 = z4 + z5;
		}
	    } else {
		if (d1) {
		    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
		    z4 = d5 + d1;

		    z5 = MULTIPLY(z4, FIX(1.175875602));
		    z1 = MULTIPLY(d1, - FIX(0.899976223));
		    tmp3 = MULTIPLY(d1, FIX(0.601344887));
		    tmp1 = MULTIPLY(d5, - FIX(0.509795578));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z4 = MULTIPLY(z4, FIX(0.785694958));

		    tmp0 = z1 + z5;
		    tmp1 += z4;
		    tmp2 = z2 + z5;
		    tmp3 += z4;
		} else {
		    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
		    tmp0 = MULTIPLY(d5, FIX(1.175875602));
		    tmp1 = MULTIPLY(d5, FIX(0.275899380));
		    tmp2 = MULTIPLY(d5, - FIX(1.387039845));
		    tmp3 = MULTIPLY(d5, FIX(0.785694958));
		}
	    }
	} else {
	    if (d3) {
		if (d1) {
		    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
		    z5 = d1 + d3;
		    tmp3 = MULTIPLY(d1, FIX(0.211164243));
		    tmp2 = MULTIPLY(d3, - FIX(1.451774981));
		    z1 = MULTIPLY(d1, FIX(1.061594337));
		    z2 = MULTIPLY(d3, - FIX(2.172734803));
		    z4 = MULTIPLY(z5, FIX(0.785694958));
		    z5 = MULTIPLY(z5, FIX(1.175875602));

		    tmp0 = z1 - z4;
		    tmp1 = z2 + z4;
		    tmp2 += z5;
		    tmp3 += z5;
		} else {
		    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
		    tmp0 = MULTIPLY(d3, - FIX(0.785694958));
		    tmp1 = MULTIPLY(d3, - FIX(1.387039845));
		    tmp2 = MULTIPLY(d3, - FIX(0.275899379));
		    tmp3 = MULTIPLY(d3, FIX(1.175875602));
		}
	    } else {
		if (d1) {
		    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
		    tmp0 = MULTIPLY(d1, FIX(0.275899379));
		    tmp1 = MULTIPLY(d1, FIX(0.785694958));
		    tmp2 = MULTIPLY(d1, FIX(1.175875602));
		    tmp3 = MULTIPLY(d1, FIX(1.387039845));
		} else {
		    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
		    tmp0 = tmp1 = tmp2 = tmp3 = 0;
		}
	    }
	}
    }

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    d0 = DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    p -= stride << 3;
    ++p;
    ++bp;			/* advance pointer to next column */
  }
}

/*
 * Named shorthands for the fixed-point multipliers of the inverse DCT.
 * Each one expands to FIX() applied to the value spelled out in its name.
 * The code visible in this file writes FIX(...) out directly and does not
 * refer to these names.
 */
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)

/*
 * DCT_EVEN --
 *
 *      Even part: reverse the even part of the forward DCT.
 *      The rotator is sqrt(2)*c(-6).
 *
 *      bp      first element of the 8-point vector to transform
 *      stride  distance between consecutive elements of that vector
 *      t10..t13  lvalues that receive the four even-part terms, scaled by
 *                2**CONST_BITS
 *
 *      Reads elements 0, 2, 4 and 6 (each times stride) of bp.  The
 *      arguments are parenthesized and the body is wrapped in
 *      do { } while (0) so that expression arguments and use as the body
 *      of an if/else behave as expected.  The output arguments must not be
 *      named z1..z3 or tmp0..tmp3, which are the macro's own temporaries.
 */
#define DCT_EVEN(bp, t10, t11, t12, t13, stride) \
do { \
	int z2 = (bp)[2 * (stride)]; \
	int z3 = (bp)[6 * (stride)]; \
	int z1 = (z2 + z3) * FIX(0.541196100); \
	int tmp2 = z1 + z3 * -FIX(1.847759065); \
	int tmp3 = z1 + z2 * FIX(0.765366865); \
 \
	int tmp0 = ((bp)[0] + (bp)[4 * (stride)]) << CONST_BITS; \
	int tmp1 = ((bp)[0] - (bp)[4 * (stride)]) << CONST_BITS; \
 \
	(t10) = tmp0 + tmp3; \
	(t13) = tmp0 - tmp3; \
	(t11) = tmp1 + tmp2; \
	(t12) = tmp1 - tmp2; \
} while (0)

/*
 * DCT_ODD --
 *
 *      Odd part per figure 8; the matrix is unitary and hence its
 *      transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 *
 *      bp      first element of the 8-point vector to transform
 *      stride  distance between consecutive elements of that vector
 *      t0..t3  int lvalues; they are first loaded with elements 7, 5, 3
 *              and 1 of the vector and end up holding the four odd-part
 *              terms, scaled by 2**CONST_BITS
 *
 *      The arguments are parenthesized and the body is wrapped in
 *      do { } while (0) so that expression arguments and use as the body
 *      of an if/else behave as expected.  The output arguments must not be
 *      named z1..z5, which are the macro's own temporaries.
 */
#define DCT_ODD(bp, t0, t1, t2, t3, stride) \
do { \
	(t0) = (bp)[(stride)*7]; \
	(t1) = (bp)[(stride)*5]; \
	(t2) = (bp)[(stride)*3]; \
	(t3) = (bp)[(stride)*1]; \
 \
	int z1 = (t0) + (t3); \
	int z2 = (t1) + (t2); \
	int z3 = (t0) + (t2); \
	int z4 = (t1) + (t3); \
	int z5 = (z3 + z4) * FIX(1.175875602); /* sqrt(2) * c3 */ \
     \
	(t0) *= FIX(0.298631336);	/* sqrt(2) * (-c1+c3+c5-c7) */ \
	(t1) *= FIX(2.053119869);	/* sqrt(2) * ( c1+c3-c5+c7) */ \
	(t2) *= FIX(3.072711026);	/* sqrt(2) * ( c1+c3+c5-c7) */ \
	(t3) *= FIX(1.501321110);	/* sqrt(2) * ( c1+c3-c5-c7) */ \
	z1 *= -FIX(0.899976223);	/* sqrt(2) * (c7-c3) */ \
	z2 *= -FIX(2.562915447);	/* sqrt(2) * (-c1-c3) */ \
	z3 *= -FIX(1.961570560);	/* sqrt(2) * (-c3-c5) */ \
	z4 *= -FIX(0.390180644);	/* sqrt(2) * (c5-c3) */ \
     \
	z3 += z5; \
	z4 += z5; \
     \
	(t0) += z1 + z3; \
	(t1) += z2 + z4; \
	(t2) += z2 + z3; \
	(t3) += z1 + z4; \
} while (0)

/*
 * idct --
 *
 *      Straightforward inverse DCT of one 8x8 coefficient block: every
 *      column and then every row is run through the full DCT_EVEN/DCT_ODD
 *      butterflies, with no zero-coefficient shortcuts.
 *
 *      bp      the 64 coefficients, eight per row; used as the workspace
 *              and overwritten
 *      mask    two words read as one 64-bit set, mask[0] holding bits
 *              0-31 and mask[1] bits 32-63.  For i = 1..63, a clear bit i
 *              causes bp[i] to be zeroed before the transform; bp[0] is
 *              always kept.  The caller's words are only read.
 *      p       destination pixels; the sample for row y, column x is
 *              stored at p[y * stride + x] after adding 128 and limiting
 *              with UCLIMIT
 *      stride  distance in bytes between vertically adjacent pixels
 */
void
idct(short* bp, u_int* mask, u_char* p, int stride)
{
	u_int m0 = mask[0];
	u_int m1 = mask[1];
	/*
	 * Shared by both pass loops.  Declared here because a name declared
	 * in the first loop's for-header is out of scope in the second loop
	 * under ISO scoping rules.
	 */
	int k;

	/* Clear every AC coefficient whose mask bit is not set. */
	for (int i = 1; i < 64; ++i) {
		/* Shift the 64-bit mask right so bit i sits in bit 0 of m0. */
		m0 >>= 1;
		m0 |= m1 << 31;
		m1 >>= 1;
		if ((m0 & 1) == 0)
			bp[i] = 0;
	}

	/* Pass 1: process columns. */
	/* Note results are scaled up by sqrt(8) compared to a true IDCT; */
	/* furthermore, we scale the results by 2**PASS1_BITS. */

	for (k = 8; --k >= 0; ) {
		/*
		 * Due to quantization, we will usually find that many of
		 * the input coefficients are zero, especially the AC terms.
		 * This could be exploited by short-circuiting the IDCT
		 * calculation for any column in which all the AC terms are
		 * zero; this version does not do so and always runs the
		 * full calculation.
		 */
		/*FIXME*/

		int e10, e11, e12, e13;
		DCT_EVEN(bp, e10, e11, e12, e13, 8);
		int o0, o1, o2, o3;
		DCT_ODD(bp, o0, o1, o2, o3, 8);

		/*
		 * Final output stage.  Only CONST_BITS-PASS1_BITS bits are
		 * removed here so that the intermediate values keep
		 * PASS1_BITS of extra precision, as described above.  (The
		 * shift counts of the two passes used to be exchanged, which
		 * gave the right overall scale but rounded the intermediate
		 * values far too coarsely.)
		 */
		bp[8*0] = DESCALE(e10 + o3, CONST_BITS-PASS1_BITS);
		bp[8*7] = DESCALE(e10 - o3, CONST_BITS-PASS1_BITS);
		bp[8*1] = DESCALE(e11 + o2, CONST_BITS-PASS1_BITS);
		bp[8*6] = DESCALE(e11 - o2, CONST_BITS-PASS1_BITS);
		bp[8*2] = DESCALE(e12 + o1, CONST_BITS-PASS1_BITS);
		bp[8*5] = DESCALE(e12 - o1, CONST_BITS-PASS1_BITS);
		bp[8*3] = DESCALE(e13 + o0, CONST_BITS-PASS1_BITS);
		bp[8*4] = DESCALE(e13 - o0, CONST_BITS-PASS1_BITS);

		++bp;		/* advance pointer to next column */
	}

	/* Pass 2: process rows. */
	/* Note that we must descale the results by a factor of 8 == 2**3, */
	/* and also undo the PASS1_BITS scaling. */

	bp -= 8;
	for (k = 8; --k >= 0; ) {

		int e10, e11, e12, e13;
		DCT_EVEN(bp, e10, e11, e12, e13, 1);
		int o0, o1, o2, o3;
		DCT_ODD(bp, o0, o1, o2, o3, 1);

		/* Final output stage */

		bp[0] = DESCALE(e10 + o3, CONST_BITS+PASS1_BITS+3);
		bp[7] = DESCALE(e10 - o3, CONST_BITS+PASS1_BITS+3);
		bp[1] = DESCALE(e11 + o2, CONST_BITS+PASS1_BITS+3);
		bp[6] = DESCALE(e11 - o2, CONST_BITS+PASS1_BITS+3);
		bp[2] = DESCALE(e12 + o1, CONST_BITS+PASS1_BITS+3);
		bp[5] = DESCALE(e12 - o1, CONST_BITS+PASS1_BITS+3);
		bp[3] = DESCALE(e13 + o0, CONST_BITS+PASS1_BITS+3);
		bp[4] = DESCALE(e13 - o0, CONST_BITS+PASS1_BITS+3);

		bp += 8;	/* advance pointer to next row */
	}

	/* Level-shift by 128, limit, and copy the block out row by row. */
	bp -= 64;
	for (int y = 0; y < 8; ++y) {
		for (int x = 0; x < 8; ++x) {
			int t;	/* scratch variable required by UCLIMIT */
			int v = *bp++;
			p[x] = UCLIMIT(v + 128, t);
		}
		p += stride;
	}
}

