/* Conversion of files between different charsets and surfaces.
   Copyright © 1990,92,93,94,96,97,98,99,00 Free Software Foundation, Inc.
   Contributed by François Pinard <pinard@iro.umontreal.ca>, 1990.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public License
   as published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This library is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the `recode' Library; see the file `COPYING.LIB'.
   If not, write to the Free Software Foundation, Inc., 59 Temple Place -
   Suite 330, Boston, MA 02111-1307, USA.  */

#include "common.h"
#include "hash.h"

/* FIXME: Cleanup memory at end of job, and softly report errors.  */

/* The satisfactory aspects are that `recode' is now able to combine a set of
   sequence of UCS-2 characters into single codes, or explode those single
   codes into the original sequence.  It may happen that many sequences reduce
   to the same code, one of them is arbitrarily taken as canonical.  Any
   combining sequence of an equivalent set produces the equivalent code, but
   at explode time, codes are always turned into their canonical sequence.  I
   did not limit the length of a combining sequence, yet it is usually small.
   Also, I did not put any limit on the number of possibly equivalent
   sequences.  For combining, for each possible partial match in any sequence,
   there is a state.  States corresponding to the scan of a single code are
   found by hash coding, while other states are linearly searched on a linked
   list of possible shifts, rooted on the state corresponding to the partial
   sequence, but without its last code.  I expect this combination of methods
   to execute reasonably fast.  Exploding is much simpler, and I merely use
   hash coding.

   The less satisfactory aspects are that the user interface is still very
   rude.  For the time being, I merely added a "combined" charset: combining
   is done by `u2..co', exploding is done by `co..u2'.  We surely could do
   much better, and build on the concept of options (introduced by `+' in the
   request syntax), Options are probably going to be used for other
   transliteration matters.  Even if built on UCS-2, combining matters would
   have natural extensions in other _smaller_ charsets, and it would be nice
   being able to derive the functionality automatically.  One difficulty to
   foresee is that, contrarily to surfaces, options like transliteration or
   combining _require_ specific charsets (like UCS-2) as a vehicle, and if
   UCS-2 is not otherwise mandated in the request, we would have to create
   such an UCS-2 intermediate.  On the other hand, if UCS-2 is sandwidched
   somewhere in a complex series of steps, we should implicitly propagate
   options towards the UCS-2 step.  Even if nothing looks utterly difficult, I
   guess that for doing it right, a lot of attention and care is needed.
   Also, I'm pretty sure that scrutinizing the documents (from w3.org or
   Unicode) that were suggested to us will raise new specific concerns.  */

/* Exploding.  */

/*---------------------------.
| Initialize for exploding.  |
`---------------------------*/

static unsigned
combined_hash (const void *void_data, unsigned table_size)
{
  const unsigned short *data = void_data;

  return *data % table_size;
}

static bool
combined_compare (const void *void_first, const void *void_second)
{
  const unsigned short *first = void_first;
  const unsigned short *second = void_second;

  return *first == *second;
}

bool
init_explode (RECODE_STEP step,
	      RECODE_CONST_REQUEST request,
	      RECODE_CONST_OPTION_LIST before_options,
	      RECODE_CONST_OPTION_LIST after_options)
{
  const unsigned short *data = step->step_table;
  Hash_table *table;

  if (before_options || after_options)
    return false;

  table = hash_initialize (0, NULL, combined_hash, combined_compare, NULL);

  if (!table)
    return false;
  step->step_type = RECODE_EXPLODE_STEP;
  step->step_table = table;

  if (!data)
    return true;

  while (*data != DONE)
    {
      if (!hash_insert (table, data))
	return false;

      while (*data != DONE)
	data++;
      data++;
    }

  return true;
}

/*------------------------------------.
| Execute exploding transformations.  |
`------------------------------------*/

bool
explode_byte_byte (RECODE_SUBTASK subtask)
{
  Hash_table *table = subtask->step->step_table;
  unsigned value;

  while (value = get_byte (subtask), value != EOF)
    {
      unsigned short lookup = value;
      unsigned short *result = hash_lookup (table, &lookup);

      if (result)
	{
	  result++;
	  while (*result != DONE && *result != ELSE)
	    {
	      put_byte (*result, subtask);
	      result++;
	    }
	}
      else
	put_byte (value, subtask);
    }

  SUBTASK_RETURN (subtask);
}

bool
explode_ucs2_byte (RECODE_SUBTASK subtask)
{
  Hash_table *table = subtask->step->step_table;
  unsigned value;

  while (get_ucs2 (&value, subtask))
    {
      unsigned short lookup = value;
      unsigned short *result = hash_lookup (table, &lookup);

      if (result)
	{
	  result++;
	  while (*result != DONE && *result != ELSE)
	    {
	      put_byte (*result, subtask);
	      result++;
	    }
	}
      else
	put_byte (value, subtask);
    }

  SUBTASK_RETURN (subtask);
}

bool
explode_byte_ucs2 (RECODE_SUBTASK subtask)
{
  Hash_table *table = subtask->step->step_table;
  unsigned value;

  if (value = get_byte (subtask), value != EOF)
    {
      if (subtask->task->byte_order_mark)
	put_ucs2 (BYTE_ORDER_MARK, subtask);

      while (true)
	{
	  unsigned short lookup = value;
	  unsigned short *result = hash_lookup (table, &lookup);

	  if (result)
	    {
	      result++;
	      while (*result != DONE && *result != ELSE)
		put_ucs2 (*result++, subtask);
	    }
	  else
	    put_ucs2 (value, subtask);

	  if (value = get_byte (subtask), value == EOF)
	    break;
	}
    }

  SUBTASK_RETURN (subtask);
}

bool
explode_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  Hash_table *table = subtask->step->step_table;
  unsigned value;

  if (get_ucs2 (&value, subtask))
    {
      if (subtask->task->byte_order_mark)
	put_ucs2 (BYTE_ORDER_MARK, subtask);

      while (true)
	{
	  unsigned short lookup = value;
	  unsigned short *result = hash_lookup (table, &lookup);

	  if (result)
	    {
	      result++;
	      while (*result != DONE && *result != ELSE)
		put_ucs2 (*result++, subtask);
	    }
	  else
	    put_ucs2 (value, subtask);

	  if (!get_ucs2 (&value, subtask))
	    break;
	}
    }

  SUBTASK_RETURN (subtask);
}

/* Combining.  */

/* A combining state represents the history of reading one or more characters
   while forming a combining sequence.

   Hash coding is used to find an initial state from the first character in a
   sequence.  If that state is not found, the character surely does not start
   any sequence.  For subsequent characters, possible shifted states from the
   current state are kept on that state, as a linked chain.  This combination
   of methods is expected to behave fast enough in practice.  */

struct state
{
  unsigned short character;	/* last character seen to trigger this state */
  unsigned short result;	/* character equivalent to the combination */
  struct state *shift;		/* list of states for one more character */
  struct state *unshift;	/* state for one less character (back link) */
  struct state *next;		/* next state in a linked chain of states */
};

/*---------------------------.
| Initialize for combining.  |
`---------------------------*/

static unsigned
state_hash (const void *void_data, unsigned table_size)
{
  const struct state *data = void_data;

  return data->character % table_size;
}

static bool
state_compare (const void *void_first, const void *void_second)
{
  const struct state *first = void_first;
  const struct state *second = void_second;

  return first->character == second->character;
}

static void
state_free (void *void_state)
{
  struct state *state = void_state;
  struct state *shift = state->shift;

  while (shift != NULL)
    {
      struct state *next_shift = shift->next;
      free (shift);
      shift = next_shift;
    }
  free (state);
}

static struct state *
prepare_shifted_state (struct state *state, unsigned character,
		       RECODE_CONST_STEP step)
{
  if (state)
    {
      struct state *shift = state->shift;

      while (shift)
	if (shift->character == character)
	  return shift;
	else
	  shift = shift->next;

      if (shift = (struct state *) malloc (sizeof (struct state)), !shift)
	return NULL;

      shift->character = character;
      shift->result = NOT_A_CHARACTER;
      shift->shift = NULL;
      shift->unshift = state;
      shift->next = state->shift;
      state->shift = shift;
      return shift;
    }
  else
    {
      Hash_table *table = step->step_table;
      struct state lookup;

      lookup.character = character;
      state = hash_lookup (table, &lookup);
      if (!state)
	{
	  if (state= (struct state *) malloc (sizeof (struct state)), !state)
	    return NULL;

	  state->character = character;
	  state->result = character;
	  state->shift = NULL;
	  state->unshift = NULL;
	  state->next = NULL;

	  if (!hash_insert (table, state))
	    return NULL;
	}
      return state;
    }
}

static struct state *
find_shifted_state (struct state *state, unsigned character,
		    RECODE_CONST_STEP step)
{
  if (state)
    {
      struct state *shift = state->shift;

      while (shift)
	if (shift->character == character)
	  return shift;
	else
	  shift = shift->next;

      return NULL;
    }
  else
    {
      Hash_table *table = step->step_table;
      struct state lookup;

      lookup.character = character;
      return hash_lookup (table, &lookup);
    }
}

bool
init_combine (RECODE_STEP step,
	      RECODE_CONST_REQUEST request,
	      RECODE_CONST_OPTION_LIST before_options,
	      RECODE_CONST_OPTION_LIST after_options)
{
  const unsigned short *data = step->step_table;
  Hash_table *table;

  if (before_options || after_options)
    return false;

  table = hash_initialize (0, NULL, state_hash, state_compare, state_free);

  if (!table)
    return false;
  step->step_type = RECODE_COMBINE_STEP;
  step->step_table = table;

  if (!data)
    return true;

  while (*data != DONE)
    {
      unsigned short result = *data++;
      struct state *state = NULL;

      while (*data != DONE)
	if (*data == ELSE)
	  {
	    if (state)
	      {
		if (state->result != NOT_A_CHARACTER)
		  abort ();
		state->result = result;
		state = NULL;
	      }
	    data++;
	  }
	else
	  {
	    state = prepare_shifted_state (state, *data++, step);
	    if (!state)
	      return false;
	  }

      if (state)
	{
	  if (state->result != NOT_A_CHARACTER
	      && state->result != state->character)
	    abort ();
	  state->result = result;
	}
      data++;
    }

  return true;
}

/*------------------.
| Handle bactrack.  |
`------------------*/

/* When we cannot shift into a state allowing a terminal resulting character,
   we ought to backtrack until such a terminal character if found, then output
   this resulting character for representing the partial sequence which ends
   with that state.  Then, we merely copy characters seen after that state.

   This approach does not properly scan for combinings which might exist in
   the copied characters, presuming that this case does not occur in practice.
   If we later find that it does, backtracing will have to be revisited.  */

static void
backtrack_byte (struct state *state, RECODE_SUBTASK subtask)
{
  if (state->result == NOT_A_CHARACTER)
    {
      backtrack_byte (state->unshift, subtask);
      put_byte (state->character, subtask);
    }
  else
    put_byte (state->result, subtask);
}

static void
backtrack_ucs2 (struct state *state,  RECODE_SUBTASK subtask)
{
  if (state->result == NOT_A_CHARACTER)
    {
      backtrack_ucs2 (state->unshift, subtask);
      put_ucs2 (state->character, subtask);
    }
  else
    put_ucs2 (state->result, subtask);
}

/*------------------------------------.
| Execute combining transformations.  |
`------------------------------------*/

bool
combine_byte_byte (RECODE_SUBTASK subtask)
{
  struct state *state = NULL;
  unsigned value;

  if (value = get_byte (subtask), value != EOF)
    {
      while (true)
	{
	  struct state *shift
	    = find_shifted_state (state, value, subtask->step);

	  if (shift)
	    {
	      state = shift;
	      if (value = get_byte (subtask), value == EOF)
		break;
	    }
	  else if (state)
	    {
	      if (state->result == NOT_A_CHARACTER)
		backtrack_byte (state, subtask);
	      else
		put_byte (state->result, subtask);
	      state = NULL;
	    }
	  else
	    {
	      put_byte (value, subtask);
	      if (value = get_byte (subtask), value == EOF)
		break;
	    }
	}

      if (state)
	{
	  if (state->result == NOT_A_CHARACTER)
	    backtrack_byte (state, subtask);
	  else
	    put_byte (state->result, subtask);
	}
    }

  SUBTASK_RETURN (subtask);
}

bool
combine_ucs2_byte (RECODE_SUBTASK subtask)
{
  struct state *state = NULL;
  unsigned value;

  if (get_ucs2 (&value, subtask))
    {
      while (true)
	{
	  struct state *shift
	    = find_shifted_state (state, value, subtask->step);

	  if (shift)
	    {
	      state = shift;
	      if (!get_ucs2 (&value, subtask))
		break;
	    }
	  else if (state)
	    {
	      if (state->result == NOT_A_CHARACTER)
		backtrack_byte (state, subtask);
	      else
		put_byte (state->result, subtask);
	      state = NULL;
	    }
	  else
	    {
	      put_byte (value, subtask);
	      if (!get_ucs2 (&value, subtask))
		break;
	    }
	}

      if (state)
	{
	  if (state->result == NOT_A_CHARACTER)
	    backtrack_byte (state, subtask);
	  else
	    put_byte (state->result, subtask);
	}
    }

  SUBTASK_RETURN (subtask);
}

bool
combine_byte_ucs2 (RECODE_SUBTASK subtask)
{
  unsigned value;

  if (value = get_byte (subtask), value != EOF)
    {
      struct state *state = NULL;

      if (subtask->task->byte_order_mark)
	put_ucs2 (BYTE_ORDER_MARK, subtask);

      while (true)
	{
	  struct state *shift
	    = find_shifted_state (state, value, subtask->step);

	  if (shift)
	    {
	      state = shift;
	      if (value = get_byte (subtask), value == EOF)
		break;
	    }
	  else if (state)
	    {
	      if (state->result == NOT_A_CHARACTER)
		backtrack_ucs2 (state, subtask);
	      else
		put_ucs2 (state->result, subtask);
	      state = NULL;
	    }
	  else
	    {
	      put_ucs2 (value, subtask);
	      if (value = get_byte (subtask), value == EOF)
		break;
	    }
	}

      if (state)
	{
	  if (state->result == NOT_A_CHARACTER)
	    backtrack_ucs2 (state, subtask);
	  else
	    put_ucs2 (state->result, subtask);
	}
    }

  SUBTASK_RETURN (subtask);
}

bool
combine_ucs2_ucs2 (RECODE_SUBTASK subtask)
{
  unsigned value;

  if (get_ucs2 (&value, subtask))
    {
      struct state *state = NULL;

      if (subtask->task->byte_order_mark)
	put_ucs2 (BYTE_ORDER_MARK, subtask);

      while (true)
	{
	  struct state *shift
	    = find_shifted_state (state, value, subtask->step);

	  if (shift)
	    {
	      state = shift;
	      if (!get_ucs2 (&value, subtask))
		break;
	    }
	  else if (state)
	    {
	      if (state->result == NOT_A_CHARACTER)
		backtrack_ucs2 (state, subtask);
	      else
		put_ucs2 (state->result, subtask);
	      state = NULL;
	    }
	  else
	    {
	      put_ucs2 (value, subtask);
	      if (!get_ucs2 (&value, subtask))
		break;
	    }
	}

      if (state)
	{
	  if (state->result == NOT_A_CHARACTER)
	    backtrack_ucs2 (state, subtask);
	  else
	    put_ucs2 (state->result, subtask);
	}
    }

  SUBTASK_RETURN (subtask);
}