/* File "tries.c":
 * Implements the data type "trie_t". */

/* This file is part of Malaga, a system for Left Associative Grammars.
 * Copyright (C) 1995-1998 Bjoern Beutel
 *
 * Bjoern Beutel
 * Universitaet Erlangen-Nuernberg
 * Abteilung fuer Computerlinguistik
 * Bismarckstrasse 12
 * D-91054 Erlangen
 * e-mail: malaga@linguistik.uni-erlangen.de 
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "basic.h"
#include "pools.h"

#undef GLOBAL
#define GLOBAL

#include "tries.h"

/*---------------------------------------------------------------------------*/

/* Capacity of the <key>, <subnode> and <content> arrays of a
 * "full_trie_node_t". */
#define KEYS_MAX 255 /* the maximum number of keys in a node */

/* Number of nodes in <path> and capacity of a node's <prefix>.
 * "new_trie" reports an error for a key that would need a longer path. */
#define TRIE_DEPTH_MAX 50 /* maximum depth of trie, i.e. maximum key length */

typedef struct 
/* A full trie node can have up to KEYS_MAX keys.
 * It is only needed to build the trie; "store_trie_node" copies it into
 * the trie pool in compact form.
 * For each i in 0..<number_of_keys>-1, <key>[i], <subnode>[i] and
 * <content>[i] describe the same entry. */
{
  u_byte_t prefix_length; /* number of chars used in <prefix> */
  char prefix[TRIE_DEPTH_MAX]; /* chars that must match before a key is tried */
  u_byte_t number_of_keys; /* number of entries used in the arrays below */
  char key[KEYS_MAX]; /* the key chars of this node */
  long_t subnode[KEYS_MAX]; /* pool index of each key's subnode, -1 if none */
  long_t content[KEYS_MAX]; /* content of each key, -1 if it has none */
} full_trie_node_t;

/* The trie is an array of compact trie nodes.
 * A compact trie node is stored in a pool of long_t and looks as follows:
 *
 * u_byte_t prefix_length;         (number of chars that precede the key)
 * char prefix[prefix_length]      (the chars that precede the key)
 * u_byte_t number_of_keys;        (the number of keys in this node)
 * char key[number_of_keys];       (the keys in this node)
 * 0..3 pad bytes of value 0       (to align next long_t)
 * long_t subnode[number_of_keys]; (indexes of the subnodes in the pool)
 * long_t content[number_of_keys]; (the contents associated with the keys) */

typedef union /* a type to access all data types a trie consists of. */
/* The code in this file sets or advances one member and then reads another
 * one, so the same position in the pool can be viewed as a length byte,
 * a char or a long_t as the compact node layout requires. */
{
  long_t *long_p;     /* pointer to long_t (subnode and content tables) */
  u_byte_t *u_byte_p; /* pointer to u_byte_t (the two length bytes) */
  char *char_p;       /* pointer to char (prefix chars, key chars, padding) */
} trie_ptr_t;

/* Work area used by "new_trie" and "store_path" while a trie is built. */
LOCAL full_trie_node_t path[TRIE_DEPTH_MAX]; 
LOCAL short_t path_index; /* index of the last node of <path> in use */
/* <path>[0..<path_index>] represents a path in the trie. The last key in
 * each node but the last has the next node as a subnode (its index can't
 * be entered in <subnode> yet because the subnode has not been stored in
 * the trie pool; "store_path" enters it as soon as the subnode is stored). */

/*---------------------------------------------------------------------------*/

LOCAL long_t store_trie_node (pool_t trie_pool, full_trie_node_t *node)
/* Append a compact copy of <node> to <trie_pool>.
 * Return the index in <trie_pool> at which the copy has been stored. */
{
  long_t table_offset; /* start of the subnode table, in long_t units */
  long_t total_size;   /* size of the whole compact node, in long_t units */
  long_t stored_index; /* index of the compact node, set by get_pool_space */
  trie_ptr_t base;     /* start of the compact node */
  trie_ptr_t table;    /* start of the subnode table */
  trie_ptr_t out;      /* write cursor */
  char *src;
  short_t k;

  /* The header consists of the two length bytes, the prefix chars and the
   * key chars; the two tables of <number_of_keys> long_t each follow it. */
  table_offset = LONGS (1 + node->prefix_length + 1 + node->number_of_keys);
  total_size = table_offset + 2 * node->number_of_keys;
  base.long_p = (long_t *) get_pool_space (trie_pool, total_size, 
                                           &stored_index);
  table.long_p = base.long_p + table_offset;
  out = base;

  /* Prefix length, then the prefix chars. */
  *out.u_byte_p++ = node->prefix_length;
  for (src = node->prefix; src < node->prefix + node->prefix_length; src++) 
    *out.char_p++ = *src;

  /* Number of keys, then the key chars. */
  *out.u_byte_p++ = node->number_of_keys;
  for (src = node->key; src < node->key + node->number_of_keys; src++) 
    *out.char_p++ = *src;

  /* Fill the gap in front of the subnode table with 0s. */
  for (; out.char_p < table.char_p; out.char_p++) 
    *out.char_p = 0;

  /* The subnode table comes first, directly followed by the content table. */
  for (k = 0; k < node->number_of_keys; k++) 
  {
    out.long_p[k] = node->subnode[k];
    out.long_p[node->number_of_keys + k] = node->content[k];
  }

  return stored_index;
}

/*---------------------------------------------------------------------------*/

LOCAL void store_path (pool_t trie_pool, short_t new_index)
/* Save <path>[<new_index>+1..<path_index>] in <trie_pool> and shorten the
 * path so that <path_index> is no longer greater than <new_index>. */
{
  while (path_index > new_index) 
  {
    full_trie_node_t *last_node = path + path_index;
    full_trie_node_t *keeper;
    short_t keeper_index = path_index;
    short_t level;
    long_t stored_node;

    /* Walk up from <last_node>. An ancestor with exactly one key and no
     * content for it can be dropped: its key char becomes part of the
     * prefix of <last_node>. The walk ends at the first ancestor that
     * must be kept, or at <new_index>, whichever comes first. */
    do 
    {
      keeper_index--;
      keeper = path + keeper_index;
    } while (keeper_index > new_index
             && keeper->number_of_keys == 1
             && keeper->content[0] == -1);

    /* Build the prefix of <last_node> from the last key of each node that
     * lies between the kept ancestor and <last_node>. */
    last_node->prefix_length = 0;
    for (level = keeper_index + 1; level < path_index; level++) 
    {
      full_trie_node_t *dropped = path + level;

      last_node->prefix[last_node->prefix_length] 
        = dropped->key[dropped->number_of_keys - 1];
      last_node->prefix_length++;
    }

    /* Store <last_node> and make it the subnode of the kept ancestor's
     * last key. */
    stored_node = store_trie_node (trie_pool, last_node);
    keeper->subnode[keeper->number_of_keys - 1] = stored_node;

    /* The path now ends at the kept ancestor. */
    path_index = keeper_index;
  }
}

/*---------------------------------------------------------------------------*/

GLOBAL void new_trie (long_t n, 
		      trie_entry_t *trie_entries, 
		      pool_t *trie_pool,
		      long_t *root_node)
/* Take <n> <trie_entries> and build a trie of them.
 * Return the trie in *<trie_pool> (a newly created pool of long_t) and the
 * pool index of its root node in *<root_node>.
 * The keys in *<trie_entries> must be unique and sorted alphabetically.
 * Key chars are entered in their TO_LOWER form.
 * A key that would need more than TRIE_DEPTH_MAX path nodes is reported
 * via "error". */
{
  long_t i; /* number of the key that is being examined */

  *trie_pool = new_pool (sizeof (long_t));

  /* Enter the empty root node in the path. "store_path" never touches the
   * root's prefix, so it is cleared here once. */
  path_index = 0;
  path[0].prefix_length = 0;
  path[0].number_of_keys = 0;

  for (i = 0; i < n; i++) 
  {
    full_trie_node_t *node;
    string_t new_key;
    short_t new_index;
    short_t shared_max; /* upper bound for <new_index> */
    
    new_key = trie_entries[i].key;
    
    /* Neither <new_key> nor <path_index> changes during the search below,
     * so the bound is computed only once. */
    shared_max = MIN ((short_t) strlen (new_key), path_index);

    /* Find node of largest index sharing common prefix with <new_key>. */
    for (new_index = 0; new_index < shared_max; new_index++) 
    {
      node = path + new_index;
      
      /* See if last key in that node is same as given key. */
      if (TO_LOWER (new_key[new_index]) 
	  != node->key[node->number_of_keys-1])
	break;
    }
    
    /* The path [0..new_index] can be used,
     * but path [new_index+1..path_index] must be stored in the trie. */
    store_path (*trie_pool, new_index);
    
    /* Add a new key to the last node in the path if necessary. */
    node = path + path_index;
    if (node->number_of_keys == 0 ||
	(node->key[node->number_of_keys-1]  != TO_LOWER (new_key[path_index])))
    {
      node->key[node->number_of_keys] = TO_LOWER (new_key[path_index]);
      node->subnode[node->number_of_keys] = -1;
      node->content[node->number_of_keys] = -1;
      node->number_of_keys++;
    }
    
    /* Generate new nodes with index up to the length of the key.
     * The first test ends the loop at once if <new_key> already ends at
     * <path_index> (an empty key, or a key all of whose chars are shared
     * with the path); without it the char behind the string terminator
     * would be read. */
    while (new_key[path_index] != EOS && new_key[path_index+1] != EOS) 
    {
      path_index++;
      /* NOTE(review): "error" is assumed not to return; otherwise the
       * stores below would write behind the end of <path> -- confirm. */
      if (path_index == TRIE_DEPTH_MAX)
	error ("trie key \"%s\" too long", new_key);
      
      path[path_index].number_of_keys = 1; 
      path[path_index].key[0] = TO_LOWER (new_key[path_index]);
      path[path_index].subnode[0] = -1;
      path[path_index].content[0] = -1;
    }
    
    /* Insert content entry at the end of the path. */
    node = path + path_index;
    node->content[node->number_of_keys-1] = trie_entries[i].content;
  }

  /* Store all nodes but the root node. */
  store_path (*trie_pool, 0);
  
  /* Store root node and save its index. */
  *root_node = store_trie_node (*trie_pool, path);
}

/*---------------------------------------------------------------------------*/

LOCAL bool_t lookup_trie_node (long_t *trie,
			       long_t *node,
			       string_t *input, 
			       long_t *content)
/* Try to match the beginning of *<input> against the single trie node with
 * index *<node> in <trie>. Input chars are compared in their TO_LOWER form.
 * Return FALSE if *<node> is -1, if the node's prefix does not match or if
 * none of its keys matches; the three results are left unchanged then.
 * Otherwise return TRUE and set:
 *   *<node> to the subnode index stored for the matched key,
 *   *<input> to the first char behind the matched key char, and
 *   *<content> to the content stored for the matched key. */
{
  trie_ptr_t cursor;       /* read position in the compact node */
  char *scan;              /* read position in the input */
  char *node_keys;         /* the key chars of the node */
  long_t *subnode_table;   /* the subnode indexes of the node */
  long_t *content_table;   /* the contents of the node */
  short_t remaining, key_count, k;

  /* There is no node to look at. */
  if (*node == -1)
    return FALSE;

  scan = *input;
  cursor.long_p = trie + *node;

  /* The node starts with its prefix length and the prefix chars; each of
   * them must be equal to the corresponding input char. */
  for (remaining = *cursor.u_byte_p++; remaining > 0; remaining--) 
  {
    if (TO_LOWER (*scan) != *cursor.char_p)
      return FALSE;
    scan++;
    cursor.char_p++;
  }

  /* Next come the number of keys and the key chars, followed (after
   * alignment) by the subnode table and the content table. */
  key_count = *cursor.u_byte_p++;
  node_keys = cursor.char_p;
  subnode_table = (long_t *) ALIGN (cursor.char_p + key_count);
  content_table = subnode_table + key_count;

  /* Take the first key that is equal to the next input char. */
  for (k = 0; k < key_count; k++) 
  {
    if (TO_LOWER (*scan) != node_keys[k])
      continue;

    *node = subnode_table[k];
    *input = scan + 1;
    *content = content_table[k];
    return TRUE;
  }

  /* None of the keys is equal to the next input char. */
  return FALSE;
}

/*---------------------------------------------------------------------------*/

GLOBAL bool_t lookup_trie (long_t *trie,
			   long_t *node, 
			   string_t *input, 
			   long_t *content)
/* Test if a prefix of *<input> matches the subtrie with root index *<node>.
 * Nodes are followed one after another until a matched key has a content
 * other than -1; then TRUE is returned and:
 *   *<node> is the subtrie for the matched input,
 *   *<input> is the pointer to first char not matched yet, and
 *   *<content> is the trie content for the matched input.
 * FALSE is returned as soon as a node cannot be matched. */
{
  for (;;) 
  {
    /* Descend one node; give up if the input does not match it. */
    if (! lookup_trie_node (trie, node, input, content))
      return FALSE;

    /* Stop at the first matched key that carries a content. */
    if (*content != -1)
      return TRUE;
  }
}

/*---------------------------------------------------------------------------*/
