/** RESIZE_IMAGE { **/
#ifndef RESIZE_IMAGE_H_INCLUDED_8aug24
#define RESIZE_IMAGE_H_INCLUDED_8aug24
#include <stdint.h>

void resize_single_channel(uint16_t weight, int iW, int iH, uint8_t *input,
                          uint8_t *output);

// rgb -> rgba : fill alpha with 0xff
void resize_24_32a(uint16_t weight,
                   int iW, int iH, uint8_t *input, int input_bpl,
                   uint8_t *output, int output_bpl);

// rgb -> rgba : skip the alpha part
void resize_24_32b(uint16_t weight,
                   int iW, int iH, uint8_t *input, int input_bpl,
                   uint8_t *output, int output_bpl);

int  resize_fit_width(int iW, int iH, int reqW,
                      int *oW, int *oH, uint16_t *weight);

#endif

/** IMPLEMENTATION { */
#ifdef RESIZE_IMAGE_IMPLEMENTATION
#include <stdlib.h>
#include <math.h>

#define FRAC_BITS 15
#define COLOR_BITS 10
#define FIXED_ONE (1<<FRAC_BITS)

static uint16_t s2l[256];
static unsigned char l2s[(1<<COLOR_BITS)+1];
static void init_tables()
{
  static int init_ok;
  if (!init_ok)
  {
    for(int i=0;i<256;i++)
      s2l[i]= (1<<COLOR_BITS) * pow( i/255.0, 2.2 );
    for(int i=0;i<=(1<<COLOR_BITS);i++)
      l2s[i]= pow( (double)i / (1<<COLOR_BITS), 1/2.2)*255;
    init_ok= 1;
  }
} 

/**
  we need some terminology so that we don't mess up 

  input: input
  fresh:  s2l[ input ]
  cooked: result of downscaling one row
  served: the row accumulator
  output: output
  
  xtotal: total weight on a row
  ytotal: total weight on an image

  pix:    pixel accumulator

  fresh: COLOR_BITS
  cooked: COLOR_BITS + FRAC_BITS
  served: COLOR_BITS + FRAC_BITS + FRAC_BITS
  output: l2s[ served >> FRAC_BITS ]

 with FRAC_BITS at 15, this doesn't fit into an uint32_t. therefore I need
 to do the shifting during the additions, I think.
*/

void resize_single_channel(uint16_t weight, int iW, int iH, uint8_t *input,
                          uint8_t *output)
{
  // we need oW pixels for translated input
  //         oW pixels for the accumulator
  int oW = weight*iW >> FRAC_BITS;

  uint8_t  *data= malloc(2*oW*sizeof(uint32_t)+iW*sizeof(uint16_t));
  uint32_t *const sS= (void*) data,    *const sE= sS + oW, *sp;
  uint16_t *const fS= (void*) sE,      *const fE= fS+iW, *fp; 
  uint16_t *const cS= (void*) fE,      *cp;
  uint32_t  totaly, totalx;
  uint32_t  P, pix;
  int y;

  init_tables();

  for(sp=sS;sp<sE;sp++) *sp= 0;
  for(y=0,totaly=0;y<iH;y++)
  {
    for(fp=fS;fp<fE;fp++,input++) *fp= s2l[*input];
    for(fp=fS, totalx=pix=0,cp=cS;fp<fE && (P=*fp,1);fp++)
      if (totalx+weight>=FIXED_ONE)
      {
        *cp++= (pix + P * (FIXED_ONE-totalx)) >> FRAC_BITS; 
        totalx = totalx + weight - FIXED_ONE;
        pix= totalx * P;
      }
      else
      {
        pix += weight * P;
        totalx += weight;
      }

    if (totaly+weight>=FIXED_ONE)
    {
      uint32_t parwe= FIXED_ONE-totaly;
      totaly = totaly + weight - FIXED_ONE;
      for(sp=sS,cp=cS;sp<sE;sp++, cp++)
      {
        *output++= l2s[ (*sp + *cp * parwe) >> FRAC_BITS ]; 
        *sp= *cp * totaly;
      }
    }
    else
    {
      for(sp=sS,cp=cS;sp<sE;sp++, cp++) *sp += *cp * weight;
      totaly += weight;
    }
  }

  free(data);
}

// rgb -> rgba : fill alpha with 0xff
void resize_24_32a(uint16_t weight,
                   int iW, int iH, uint8_t *input, int input_bpl,
                   uint8_t *output, int output_bpl)
{
  int oW = weight*iW >> FRAC_BITS;

  uint32_t data_size=   oW*3*sizeof(uint32_t) + 
                      2*iW*3*sizeof(uint16_t);
  uint8_t  *data= malloc(data_size);
  uint32_t *const sS= (void*) data,    *const sE= sS + 3*oW, *sp;
  uint16_t *const fS= (void*) sE,      *const fE= fS + 3*iW, *fp; 
  uint16_t *const cS= (void*) fE,      *cp;
  uint32_t  totaly, totalx;
  uint32_t  P0, P1, P2, pix0, pix1, pix2;
  const uint32_t input_rowdelta= input_bpl - 3*iW;
  const uint32_t output_rowdelta= output_bpl - 4*oW;

  init_tables();

  for(sp=sS;sp<sE;sp++) *sp= 0;
  for(totaly=0;iH;iH--)
  {
    for(fp=fS;fp<fE;fp++,input++) *fp= s2l[*input];
    input += input_rowdelta;
    for(fp=fS, totalx=pix0=pix1=pix2=0,cp=cS;
        fp<fE && (P0=*fp++,P1=*fp++,P2=*fp++,1);)
      if (totalx+weight>=FIXED_ONE)
      {
        *cp++= (pix0 + P0 * (FIXED_ONE-totalx)) >> FRAC_BITS; 
        *cp++= (pix1 + P1 * (FIXED_ONE-totalx)) >> FRAC_BITS; 
        *cp++= (pix2 + P2 * (FIXED_ONE-totalx)) >> FRAC_BITS; 
        totalx = totalx + weight - FIXED_ONE;
        pix0= totalx * P0;
        pix1= totalx * P1;
        pix2= totalx * P2;
      }
      else
      {
        pix0 += weight * P0;
        pix1 += weight * P1;
        pix2 += weight * P2;
        totalx += weight;
      }

    if (totaly+weight>=FIXED_ONE)
    {
      uint32_t parwe= FIXED_ONE-totaly;
      totaly = totaly + weight - FIXED_ONE;
      for(sp=sS,cp=cS;sp<sE;)
      {
        *output++= l2s[ (*sp + *cp * parwe) >> FRAC_BITS ]; 
        *sp++= *cp++ * totaly;
        *output++= l2s[ (*sp + *cp * parwe) >> FRAC_BITS ]; 
        *sp++= *cp++ * totaly;
        *output++= l2s[ (*sp + *cp * parwe) >> FRAC_BITS ]; 
        *sp++= *cp++ * totaly;
        *output++= 0xff;
      }
      output += output_rowdelta;
    }
    else
    {
      for(sp=sS,cp=cS;sp<sE;sp++, cp++) *sp += *cp * weight;
      totaly += weight;
    }
  }

  free(data);
}

// rgb -> rgba : skip the alpha part
void resize_24_32b(uint16_t weight,
                   int iW, int iH, uint8_t *input, int input_bpl,
                   uint8_t *output, int output_bpl)
{
  int oW = weight*iW >> FRAC_BITS;

  uint32_t data_size=   oW*3*sizeof(uint32_t) + 
                      2*iW*3*sizeof(uint16_t);
  uint8_t  *data= malloc(data_size);
  uint32_t *const sS= (void*) data,    *const sE= sS + 3*oW, *sp;
  uint16_t *const fS= (void*) sE,      *const fE= fS + 3*iW, *fp; 
  uint16_t *const cS= (void*) fE,      *cp;
  uint32_t  totaly, totalx;
  uint32_t  P0, P1, P2, pix0, pix1, pix2;
  const uint32_t input_rowdelta= input_bpl - 3*iW;
  const uint32_t output_rowdelta= output_bpl - 4*oW;

  init_tables();

  for(sp=sS;sp<sE;sp++) *sp= 0;
  for(totaly=0;iH;iH--)
  {
    for(fp=fS;fp<fE;fp++,input++) *fp= s2l[*input];
    input += input_rowdelta;
    for(fp=fS, totalx=pix0=pix1=pix2=0,cp=cS;
        fp<fE && (P0=*fp++,P1=*fp++,P2=*fp++,1);)
      if (totalx+weight>=FIXED_ONE)
      {
        *cp++= (pix0 + P0 * (FIXED_ONE-totalx)) >> FRAC_BITS; 
        *cp++= (pix1 + P1 * (FIXED_ONE-totalx)) >> FRAC_BITS; 
        *cp++= (pix2 + P2 * (FIXED_ONE-totalx)) >> FRAC_BITS; 
        totalx = totalx + weight - FIXED_ONE;
        pix0= totalx * P0;
        pix1= totalx * P1;
        pix2= totalx * P2;
      }
      else
      {
        pix0 += weight * P0;
        pix1 += weight * P1;
        pix2 += weight * P2;
        totalx += weight;
      }

    if (totaly+weight>=FIXED_ONE)
    {
      uint32_t parwe= FIXED_ONE-totaly;
      totaly = totaly + weight - FIXED_ONE;
      for(sp=sS,cp=cS;sp<sE;)
      {
        *output++= l2s[ (*sp + *cp * parwe) >> FRAC_BITS ]; 
        *sp++= *cp++ * totaly;
        *output++= l2s[ (*sp + *cp * parwe) >> FRAC_BITS ]; 
        *sp++= *cp++ * totaly;
        *output++= l2s[ (*sp + *cp * parwe) >> FRAC_BITS ]; 
        *sp++= *cp++ * totaly;
        output++;
      }
      output += output_rowdelta;
    }
    else
    {
      for(sp=sS,cp=cS;sp<sE;sp++, cp++) *sp += *cp * weight;
      totaly += weight;
    }
  }

  free(data);
}

int resize_fit_width(int iW, int iH, int reqW,
                      int *RoW, int *RoH, uint16_t *Rweight)
{
  int  weight;
  weight= (1<<FRAC_BITS) * (double) reqW / iW;
  if (!weight) return 1;
  while((weight*iW >> FRAC_BITS) < reqW) weight++;
  *Rweight= weight;
  *RoH= (weight*iH) >> FRAC_BITS;
  *RoW= (weight*iW) >> FRAC_BITS;
  return 0;
}
#undef FRAC_BITS 
#undef COLOR_BITS 
#undef FIXED_ONE 
#endif
/** } IMPLEMENTATION  */

#ifdef RESIZE_IMAGE_SELF_TEST
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <stdint.h>
#include "lodepng.h"

int main(int argc, char **argv)
{
  unsigned bigw, bigh;
  unsigned char *big;
  int smallw, smallh;
  uint8_t *out;
  uint16_t weight;

  smallw= atoi(argv[2]);
  if (smallw<0)
  {
    smallw= -smallw;
    if (lodepng_decode_file(&big, &bigw, &bigh, argv[1], LCT_GREY, 8))
      err(1,"load");
    if (resize_fit_width(bigw, bigh, smallw, &smallw, &smallh, &weight))
      err(3,"outofbounds");
    out= malloc(smallw*smallh);
    resize_single_channel(weight, bigw, bigh, big, out);
    if (lodepng_encode_file(argv[3], out, smallw, smallh, LCT_GREY, 8))
      err(2,"save");
  }
  else
  {
    if (lodepng_decode_file(&big, &bigw, &bigh, argv[1], LCT_RGB, 8))
      err(1,"load");
    if (resize_fit_width(bigw, bigh, smallw, &smallw, &smallh, &weight))
      err(3,"outofbounds");
    out= malloc(smallw*smallh*4);
    resize_24_32a(weight, bigw, bigh, big, bigw*3, out, smallw*4);
    if (lodepng_encode_file(argv[3], out, smallw, smallh, LCT_RGBA, 8))
      err(2,"save");
  }

  free(out);
  free(big);
  return 0;
}
#endif
/** } SELF_TEST **/
/** } RESIZE_IMAGE **/
