/*
  DctVideo.java
 
  An implementation of 8x8 DCT and IDCT for video compression.
  32-bit integer-arithmetic is assumed.
  For DCT operation:
     dct ( int [][] X, int [][] Y );
     X is the input of 8x8 array of samples; values must be within [0, 255]
     Y is the output of 8x8 array of DCT coefficients.
  For IDCT operation:
     idct ( int [][] Y, int [][] X );
     Y is the input of an 8x8 array of DCT coefficients.
     X is the output of an 8x8 array of sample values.

  compile: javac DctVideo.cpp
  To use them, create an object of the class DctVideo:
      DctVideo dct_idct = new DctVideo();
  then call the functions by:
      dct_idct.dct ( ... );
      dct_idct.idct ( ... );

  See Chapter 6 of "An Introduction to Digital Video Data Compression in Java"
    by Fore June. 
*/

class DctVideo {
  private final double PI =  3.141592653589;
  private final short shift = 10;             //10 bits precision for fixed-point arithmetic
  //at the final stage, values have been shifted twice
  private final short shift1 = 2 * shift;       
  private final int fac = 1 << shift ;        //multiply all constants by 2^10 ( = 1024 )
  private final int delta = 1 << (shift-1);   //for rounding adjustment ~ 0.5x2^10
  private final int delta1 = 1 << (shift1-1); //for final rounding adjustment ~ 0.5x2^20
  private final double a = PI / 16.0;         //angle theta

  //DCT constants; use integer-arithmetic.
  private final int c0 = (int) ( 1 / Math.sqrt ( 2 ) * fac );
  private final int c1 = (int) ( Math.cos ( a ) / 2 * fac );
  private final int c2 = (int) ( Math.cos ( 2*a ) / 2 * fac );
  private final int c3 = (int) ( Math.cos ( 3*a ) / 2 * fac );
  private final int c4 = (int) ( Math.cos ( 4*a ) / 2 * fac );
  private final int c5 = (int) ( Math.cos ( 5*a ) / 2 * fac );
  private final int c6 = (int) ( Math.cos ( 6*a ) / 2 * fac );
  private final int c7 = (int) ( Math.cos ( 7*a ) / 2 * fac );

  /*
    DCT function.
    Input: X, array of 8x8, containing data with values in [0, 255].
    Ouput: Y, array of 8x8 DCT coefficients.
  */
  void dct(int [][] X, int [][] Y)
  {
    int   i, j, j1, k;
    int [] x = new int[8], x1 = new int[8];
    int [][]  m = new int[8][8];
    /*
      Row transform 
      i-th row, k-th element 
    */
    for (i = 0, k = 0; i < 8; i++, k++) {
      for (j = 0; j < 8; j++) 
        x[j] = X[k][j];           //data for one row
      
      for (j = 0; j < 4; j++) {   //first stage transform, see flow-graph
        j1 = 7 - j;
        x1[j] = x[j] + x[j1];
        x1[j1] = x[j] - x[j1];
      }
      x[0] = x1[0] + x1[3];       //second stage transform
      x[1] = x1[1] + x1[2];
      x[2] = x1[1] - x1[2];
      x[3] = x1[0] - x1[3];
      x[4] = x1[4];
      //after multiplication, add delta for rounding,
      x[5] = ((x1[6] - x1[5]) * c0 + delta ) >> shift;  
      //shift-right to undo 'x fac' to line up binary points of all x[i]
      x[6] = ((x1[6] + x1[5]) * c0 + delta ) >> shift;  
      x[7] = x1[7];               

      m[i][0] = (x[0]+x[1])*c4;   //upper-half of third (final) stage
      m[i][4] = (x[0] - x[1]) * c4;
      m[i][2] = x[2] * c6 + x[3] * c2;
      m[i][6] = x[3] * c6 - x[2] * c2;

      x1[4] = x[4] + x[5];        //lower-half of third stage
      x1[5] = x[4] - x[5];
      x1[6] = x[7] - x[6];
      x1[7] = x[7] + x[6];
	
      m[i][1] = x1[4]*c7 + x1[7]*c1;//lower-half of fourth (final) stage
      m[i][7] = x1[7] * c7 - x1[4] * c1;
      m[i][5] = x1[5] * c3 + x1[6] * c5;
      m[i][3] = x1[6] * c3 - x1[5] * c5;
    } //for i

    /*
       At this point, coefficients of each row (m[i][j]) has been multiplied
       by 2^10. We can undo the multiplication by << 10 here before doing 
       the vertical transform. However, as we are using int variables, 
       which are 32-bit to do multiplications, we can tolerate another 
       multiplication of 2^10. So we delay our undoing until the end
       of the vertical transform and we undo all left-shift operations 
       by shifting the results right 20 bits ( i.e. << 2 * 10 ).
    */
    // Column transform 
    for (i = 0; i < 8; i++) {                    //eight columns

      //consider one column
      for (j = 0; j < 4; j++) {                 //first-stage operation
        j1 = 7 - j;
        x1[j] = m[j][i] + m[j1][i];
        x1[j1] = m[j][i] - m[j1][i];
      }
                                                //second-stage operation
      x[0] = x1[0] + x1[3];
      x[1] = x1[1] + x1[2];
      x[2] = x1[1] - x1[2];
      x[3] = x1[0] - x1[3];
      x[4] = x1[4];
      //undo one shift for x[5], x[6] to avoid overflow
      x1[5] = (x1[5] + delta) >> shift;
      x1[6] = (x1[6] + delta) >> shift;

      x[5] = (x1[6] - x1[5]) * c0;	
      x[6] = (x1[6] + x1[5]) * c0;
      x[7] = x1[7];

      m[0][i] = (x[0] + x[1]) * c4; //upper-half of third (final) stage
      m[4][i] = (x[0] - x[1]) * c4;
      m[2][i] = ( x[2] * c6 + x[3] * c2 ) ;
      m[6][i] = ( x[3] * c6 - x[2] * c2 );
      
      x1[4] = x[4] + x[5];               //lower-half of third stage
      x1[7] = x[7] + x[6];
      x1[5] = x[4] - x[5];
      x1[6] = x[7] - x[6];

      m[1][i] = x1[4] * c7 + x1[7] * c1; //lower-half of fourth stage
      m[5][i] = x1[5] * c3 + x1[6] * c5;
      m[3][i] = x1[6] * c3 - x1[5] * c5;
      m[7][i] = x1[7] * c7 - x1[4] * c1;
    } // for i

    //we have left-shift (multiplying constants) twice
    //so undo them by right-shift
    for ( i = 0; i < 8; ++i ) {
      for ( j = 0; j < 8; ++j ) {
        Y[i][j] =  ( m[i][j] + delta1 ) >> shift1;    //round the value by adding delta
      }
    }
  }

  /*
    Implementation of idct() is to reverse the operations of dct().
    We first do vertical transform and then horizontal; this is easier
    for debugging as the operations are just the reverse of those in dct();
    of course,it works just as well if you do the horizontal transform first.
    So in this implementation, the first stage of idct() is the final stage 
    of dct() and the final stage of idct() is the first stage of dct().
  */
  void idct(int [][] Y, int [][] X)
  {
    int   j1, i, j;
    int [] x = new int[8], x1 = new int[8], y = new int[8];
    int [][] m = new int[8][8];

    //column transform
    for ( i = 0; i < 8; ++i ) {
      for (j = 0; j < 8; j++) 
        y[j] = Y[j][i];				

      x1[4] = y[1] * c7 - y[7] * c1;   //lower-half final stage of dct
      x1[7] = y[1] * c1 + y[7] * c7;
      x1[6] = y[3] * c3 + y[5] * c5;
      x1[5] = -y[3] * c5 + y[5] * c3;
      
      x[4] = ( x1[4] + x1[5] );        //lower-half of third stage of dct
      x[5] = ( x1[4] - x1[5] );
      x[6] = ( x1[7] - x1[6] );
      x[7] = ( x1[7] + x1[6] );
    
      x1[0] = ( y[0] + y[4] ) * c4;    //upper-half of 3rd (final) dct stage
      x1[1] = ( y[0] - y[4] ) * c4;
      x1[2] = y[2] * c6 - y[6] * c2;
      x1[3] = y[2] * c2 + y[6] * c6;
    
    
      x[0] = ( x1[0] + x1[3] );        //second stage of dct 
      x[1] = ( x1[1] + x1[2] );
      x[2] = ( x1[1] - x1[2] );
      x[3] = ( x1[0] - x1[3] );


      //x[4], x[7] no change
      x1[5] = ((x[6] - x[5]) * c0 + delta ) >> shift;   //after multiplication, add delta for rounding,
      x1[6] = ((x[6] + x[5]) * c0 + delta ) >> shift;   //  shift-right to undo 'x fac' to line up x[]s
      x[5] = x1[5];
      x[6] = x1[6];  
    
      for (j = 0; j < 4; j++) {        //first stage transform of dct
        j1 = 7 - j;
        m[j][i] = (x[j] + x[j1] );
        m[j1][i] = (x[j] - x[j1] );
      }
    } //for i

    //row transform
    for ( i = 0; i < 8; i++ ) {
      for (j = 0; j < 8; j++) 
        y[j] = m[i][j] ;               //data for one row
    
      x1[4] = y[1] * c7 - y[7] * c1;
      x1[7] = y[1] * c1 + y[7] * c7;
      x1[6] = y[3] * c3 + y[5] * c5;
      x1[5] = -y[3] * c5 + y[5] * c3;

      x[4] = ( x1[4] + x1[5] );        //lower-half of third stage
      x[5] = ( x1[4] - x1[5] );
      x[6] = ( x1[7] - x1[6] );
      x[7] = ( x1[7] + x1[6] );
    
      x1[0] = ( y[0] + y[4] ) * c4;
      x1[1] = ( y[0] - y[4] ) * c4;
      x1[2] = y[2] * c6 - y[6] * c2;
      x1[3] = y[2] * c2 + y[6] * c6;
    
      //undo one shift for x[5], x[6] to avoid overflow
      x1[5] = (x[5] + delta) >> shift;
      x1[6] = (x[6] + delta) >> shift;
      x[5] = (x1[6] - x1[5]) * c0;  		
      x[6] = (x1[6] + x1[5]) * c0;  		
      //x[4], x[7] no change
      x[0] = ( x1[0] + x1[3] );        //second stage transform
      x[1] = ( x1[1] + x1[2] );
      x[2] = ( x1[1] - x1[2] );
      x[3] = ( x1[0] - x1[3] );

      for (j = 0; j < 4; j++) {        //1st stage transform,see flow-graph
        j1 = 7 - j;
        m[i][j] = (x[j] + x[j1]);
        m[i][j1] = (x[j] - x[j1]);
      }
    }
    //we have left-shift (multiplying constants) twice
    for ( i = 0; i < 8; ++i ) {
      for ( j = 0; j < 8; ++j ) {
        X[i][j] =  ( m[i][j] + delta1 ) >> shift1;  //round by adding delta
      }
    }
  }
}

