/*
 * ASTL - the Automaton Standard Template Library.
 * C++ generic components for Finite State Machine handling.
 * Copyright (C) 2000 Vincent Le Maout (vlemaout@lexiquest.fr).
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */


// File : tokenizer.h
// dictionary must be a dfa hash
// tokenizer maps a char stream to an int stream using a hash dictionary
// the words not in the dictionary map to zero

#include <iostream>
#include <iterator>
#include <cursor.h>
#include <vector>

using namespace std;

template <class dictionary, class InputIterator>
class tokenizer : public iterator<input_iterator_tag, int>
{
private:
  dictionary           &dico;
  int                  token, next_token;
	D_cursor<dictionary> cursor;
	InputIterator        i, eof;
  vector<char>         buffer, snd_buffer;
  vector<char>::size_type snd_iterator;
	vector<char>::size_type next_token_start, next_token_end,
                          current_token_start, current_token_end;
  unsigned long offset;
  bool EndOfFile;

  tokenizer(dictionary &d, InputIterator first, InputIterator last)
    : dico(d), cursor(d), i(first), eof(last), token(0), next_token(0),
      token_start(0), next_token_start(0), next_token_end(0),
      offset(0)
  {
    buffer.reserve(24576);
    snd_buffer.reserve(24576);
    snd_iterator = 0;
    EndOfFile = end_of_file();
    if (!end_of_file()) advance();
  }

  tokenizer(dictionary &d) : d(dico), EndOfFile(true) { }

  char next_char()
  {
    if (snd_iterator != snd_buffer.size())
      return (snd_buffer[snd_iterator++]);
    else
      return (*i++);
  }

  bool end_of_file() {
    return (i == eof && snd_iterator == snd_buffer.size());
  }

	int advance()
	{
		buffer.clear();
    token_start = next_token_start = next_token_end = 0;
    token = next_token = -1;

		// Avance jusqu'au plus long match
		for(cursor = dico.initial(), buffer.push_back(next_char()); 
				;
				buffer.push_back(next_char()))
		{
			if (!cursor.forward(buffer.back()))
        if (next_token_end != 0)
          break;
        else
				  next_token_start = buffer.size() - 1;
			else
				if (dico.final(cursor)) 
        {
          next_token_end = buffer.size();
          next_token = dico.tag(cursor);
          if (dico.delta2(cursor).empty()) break;
        }
			if ((EndOfFile = end_of_file())) break;
		}
    
    if (next_token_end != buffer.size())
    {
      snd_buffer.erase(buffer.begin(), buffer.begin() + snd_iterator);
      snd_buffer.insert(snd_buffer.end(), buffer.begin() + next_token_end, buffer.end());
      snd_iterator = 0;
    }
       
    if (next_token_start != 0) // mot inconnu avant ?
    {
      current_token_start = 0;
      current_token_end = next_token_start;
      token = 0;
    }
    else
    {
      token = next_token;
      current_token_start = next_token_start;
      current_token_end = next_token_end;
      next_token = -1;
    }
  }

  friend bool operator == (const tokenizer &x, const tokenizer &y) {
    return (x.EndOfFile == y.EndOfFile);
  }

  int operator * () const {
    return (token);
  }

  tokenizer& operator ++ ()
  {
    if (next_token != -1)
    {
      token = next_token;
      next_token = -1;
      current_token_start = next_token_start;
      current_token_end = next_token_end;
    }
    else
      advance();
    return (*this);
  }
  
  tokenizer operator ++ (int)
  {
    tokenizer tmp = *this;
    ++(*this);
    return (tmp);
  }

  pair<vector<char>::iterator, vector<char>::iterator> word() const {
    return (make_pair(current_token_start, current_token_end));
  }
};
  /*        
void advance()
  {
    w.clear();
    next_w.clear();
    unsigned long start_in = offset;
    unsigned long word_start = start_in;

    while(!in.eof())
    {
      ++offset;
      char c = in.get();
      w.push_back(c);
      if (!cursor.forward(c))
      {
        word_start = offset;
        if (cursor != dico.initial())
        {
          in.putback(c);
          w.pop_back();
          cursor = dico.initial();
          cursor.reset();
          --offset;
        }
      }
      else              // forwarding
        if (dico.final(cursor)) break;
    }
    eof = in.eof();
    if (dico.final(cursor))
    {
      if (word_start != start_in)
      {
        copy(w.begin() + word_start - start_in, w.end(), back_inserter(next_w));
        w.resize(word_start - start_in);
        token = 0;
        next_token = cursor.hash();
      }
      else
        token = cursor.hash();
    }
    else
      token = 0;   // no match && end_of_file
  }

public:
  tokenizer(istream &i, dictionary &d)
    : dico(d), token(-1), next_token(-1), 
      in(i), cursor(dico), eof(in.eof()), offset(0) 
  {
    cursor = dico.initial();
    cursor.reset();
    w.reserve(256);
    next_w.reserve(256);
    if (!eof)
      advance();
  }
  
  tokenizer(dictionary &d)     // end_of_file
    : dico(d), token(-1), next_token(-1), 
      in(cin), cursor(dico), eof(true)
  { }
  
};
*/