//-------------------------------------------------------------------------------------------------
// Matrox10bitPack.fx
//
// Copyright (c) LWKS Software Ltd.  All Rights Reserved
//-------------------------------------------------------------------------------------------------

// Texture holding the image to be packed. It is only ever read through
// SourceImageSampler (declared immediately below).
// NOTE(review): presumably bound by the host application before drawing - confirm.
texture gSourceImage; // The source image

// Sampler used for every tex2D fetch from gSourceImage.
// Mip/min/mag filtering are all POINT, so a fetch is never a filtered blend of
// neighbouring texels, and both axes are clamped to the texture edge.
sampler SourceImageSampler = 
sampler_state
{
   Texture = <gSourceImage>;
   MipFilter = POINT;
   MinFilter = POINT;
   MagFilter = POINT; // Alternatives tried previously: GAUSSIANQUAD, LINEAR
   AddressU  = ClampToEdge;
   AddressV  = ClampToEdge;
};

// Width of the source image in DWORDs. ps_main divides a source column index by
// this value to obtain the u texture coordinate of the texel to fetch.
float pSourceImageTotalWidthInDWORDs;

// Width of the destination image. ps_main multiplies the incoming uv1.x by this
// value to obtain the output column being written.
// NOTE(review): presumably also measured in DWORDs (one per output pixel) - confirm.
float DestImageWidth;

//-------------------------------------------------------------------------------------------------
// ps_main
//
// Called once per output pixel; produces one DWORD of the packed output row.
//
// The output row is organised as repeating blocks of 10 DWORDs:
//   - DWORDs 0..7 : the high 8 bits of every component of 8 consecutive source texels
//                   (described in the original notes as YCrYCb data).
//   - DWORDs 8..9 : the 2 "extra" low bits. Each of these DWORDs covers 4 source texels: one
//                   output byte per texel, holding that texel's four components at 2 bits each
//                   (first component in bits 0-1, fourth component in bits 6-7).
//
// uv1.x : horizontal position; multiplied by DestImageWidth to give the output column.
// uv1.y : used unchanged as the v coordinate of every source fetch.
//
// Returns the packed bytes as a normalised colour (byte value / 255 per channel).
//-------------------------------------------------------------------------------------------------
float4 ps_main( float2 uv1 : TEXCOORD1 ) : COLOR0
{
   const int   kBlockSizeInComponentValuesI = 32;

   const int   kBaseBlockSizeInDWORDsI = kBlockSizeInComponentValuesI / 4;       // eg, 8
   const float kBaseBlockSizeInDWORDsF = (float)kBaseBlockSizeInDWORDsI;         //     8.0

   const int   kExtrasBlockSizeInDWORDsI = kBlockSizeInComponentValuesI / 16;    //     2

   const int   kTotalBlockSizeInDWORDsI = kBaseBlockSizeInDWORDsI + kExtrasBlockSizeInDWORDsI;  // 10
   const float kTotalBlockSizeInDWORDsF = (float)kTotalBlockSizeInDWORDsI;       // 10.0

   // Output column for this pixel.
   float x = uv1.x * DestImageWidth;

   // Index of the 10-DWORD block containing x. Half a DWORD is added before flooring.
   // NOTE(review): presumably this guards against x arriving marginally below an integer - confirm.
   float block = floor( ( x / kTotalBlockSizeInDWORDsF ) + ( 0.5 / kTotalBlockSizeInDWORDsF ) );

   // Position within the block: 0..7 => high bytes, 8..9 => extras.
   float phase = floor( x - ( block * kTotalBlockSizeInDWORDsF ) );

   float4 ret;
   float4 ret2;

   if ( phase < kBaseBlockSizeInDWORDsF )
   {
      // Need to write out some 8-bit wide high bytes

      // NOTE: Hmm. Might just be able to output sourcePixel without any
      //       adjustments - depends how the 16 -> 8 bit rounding might work.
      float2 uv;
      uv.y = uv1.y;

      // Source column = 8 DWORDs per block + phase, converted to the 0..1 range.
      // (1) the small 0.1 bias is required for HAL, not for s/w
      uv.x = ( ( block * kBaseBlockSizeInDWORDsF ) + phase + 0.1 ) / pSourceImageTotalWidthInDWORDs;

      // Truncate each component to 8 bits, then rescale so that byte value N is emitted as N / 255.
      ret = floor( 256.0f * tex2D( SourceImageSampler, uv ) ) / 256.0f * 256.0f / 255.0f;
      ret2 = ret;
   }
   else
   {
      // Need to write out some 2-bit extras

      // phase = 8,9 => offset = 0,4. We want to look at either the first 4 or second 4 DWORDs in
      // the block, depending on whether we're outputting the first or second DWORD of extra bits.
      float offset = ( phase - kBaseBlockSizeInDWORDsF ) * 4.0;

      // (2) the 0.5 bias is required for HAL, not for s/w
      float blockStart = ( block * kBaseBlockSizeInDWORDsF ) + offset + 0.5;

      // One (u,v) pair per row: the 4 consecutive source texels covered by this extras DWORD.
      float4x2 uv;

      uv._m01_m11_m21_m31 = uv1.y;
      uv._m00_m10_m20_m30 = blockStart;

      const float4 kAdds = { 0, 1, 2, 3 };

      uv._m00_m10_m20_m30 += kAdds;
      uv._m00_m10_m20_m30 /= pSourceImageTotalWidthInDWORDs;

      row_major float4x4 pixel_values;

      // Scale so the integer part is the high byte and the fraction holds the remaining low bits.
      pixel_values[ 0 ] = 256.0f * tex2D( SourceImageSampler, uv[0] );
      pixel_values[ 1 ] = 256.0f * tex2D( SourceImageSampler, uv[1] );
      pixel_values[ 2 ] = 256.0f * tex2D( SourceImageSampler, uv[2] );
      pixel_values[ 3 ] = 256.0f * tex2D( SourceImageSampler, uv[3] );

      // Keep the top two bits of the fraction: floor( frac * 4 ) is 0..3, stored here as
      // 0, 0.25, 0.5 or 0.75. Rounding the fraction on its own could only ever give 0 or 1, which
      // would drop one of the two extra bits; floor also matches the truncation used for the
      // high byte above.
      pixel_values[ 0 ] = floor( frac( pixel_values[ 0 ] ) * 4.0 ) / 4.0;
      pixel_values[ 1 ] = floor( frac( pixel_values[ 1 ] ) * 4.0 ) / 4.0;
      pixel_values[ 2 ] = floor( frac( pixel_values[ 2 ] ) * 4.0 ) / 4.0;
      pixel_values[ 3 ] = floor( frac( pixel_values[ 3 ] ) * 4.0 ) / 4.0;

      // Combined with the /4 above these weights are 1, 4, 16 and 64 (over 255), i.e. each
      // component's 2-bit value is shifted into its own bit pair of the output byte.
      const float4 k2BitMulVec = { 4.0f/255.0f, 16.0f/255.0f, 64.0f/255.0f, 256.0f/255.0f };

      // One output byte per source texel.
      ret[0] = dot( pixel_values[0], k2BitMulVec );
      ret[1] = dot( pixel_values[1], k2BitMulVec );
      ret[2] = dot( pixel_values[2], k2BitMulVec );
      ret[3] = dot( pixel_values[3], k2BitMulVec );

      // Exchange the r and b channels (a and g are unchanged); the high-byte path does not do this.
      // NOTE(review): presumably matches the byte order the consumer expects for extras - confirm.
      ret2.argb = ret.abgr;
   }

   return ret2;
}

// Single-pass technique that installs ps_main as the pixel shader.
// NOTE(review): PROFILE is not defined in this file - presumably a macro naming the target
// shader profile, supplied when the effect is compiled; confirm.
technique T1 { pass P1 { PixelShader = compile PROFILE ps_main(); } }

