/*
 * File........: c:/arc/prog/java/gk/util/sgml/SGMLScanner.java
 * Package.....: gk.util.sgml
 * Created.....: 98/08/18, Guido Krueger
 * RCS.........: $Revision: 1.2 $
 *               $Date: 1998/09/08 22:42:12 $ $Author: guido $
 *
 * Copyright (c) 1998 Guido Krueger. All Rights Reserved.
 *
 * Permission to use, copy, modify, and distribute this
 * software and its documentation for NON-COMMERCIAL purposes
 * and without fee is hereby granted provided that this
 * copyright notice appears in all copies.
 *
 * THE AUTHOR MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE 
 * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, 
 * INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR 
 * NON-INFRINGEMENT. THE AUTHOR SHALL NOT BE LIABLE FOR ANY 
 * DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING 
 * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
 *
 * <hr>
 * This class implements a simple SGML scanner which is able to
 * perform a lexical analysis of SGML documents. It has been created
 * from a number of semi-formal SGML documents and should provide
 * a working approach, though it may not always work perfectly. To scan
 * a given SGML text, the following steps have to be carried out:
 * <p>
 * 0. Create your own class which is derived from SGMLScanner
 * 1. Create a Reader which could be used to read the data
 * 2. Call startScanner and pass the Reader as an argument
 * 3. startScanner will analyze the data and then call either of
 *    the following callback methods:
 *    - element() is called after an SGML element has been
 *      parsed (i.e. <h1> or <TITLE> or <a href="#xyz.html">). The
 *      element name and argument data is passed as an object of
 *      type SGMLElement.
 *    - special() is called after an element has been
 *      found that starts with "<!" (i.e. an SGML declaration, a
 *      comment, or similar). The complete element is transferred
 *      as a String argument.
 *    - entity() is called after an entity has been found
 *      (i.e. &uuml; or &amp;). The method is free to interpret
 *      the entity and should return the converted value. The
 *      standard method just returns the unchanged value.
 *    - pcData is called after a PCDATA sequence has been completely
 *      parsed (i.e. the code between two entities).
 *    - The init() and exit() methods are called once at the beginning and
 *      end of the lexical analysis. They could be used to execute 
 *      initialization / termination code.
 * 4. If there is no syntax error, the startScanner method will
 *    eventually terminate, otherwise an SGMLScannerException
 *    will be thrown.
 */
package gk.util.sgml;

import java.io.*;

/**
 * Callback-driven lexical scanner for SGML-like markup.
 * <p>
 * Subclasses override the callback methods ({@link #element},
 * {@link #special}, {@link #entity}, {@link #pcData}, {@link #init},
 * {@link #exit}) and then call {@link #startScanner} with a Reader. The
 * scanner is a character-at-a-time state machine; it keeps per-scan state in
 * instance fields, so one instance must not run two scans at the same time.
 */
public class SGMLScanner
{
  //Scanner states. The numeric values are part of the error messages and
  //index into statenames (value - 1), so both must be kept in sync.
  private static final int ST_TEXT           = 1;
  private static final int ST_ELEMENT_START  = 2;
  private static final int ST_SPECIAL        = 3;
  private static final int ST_END_ELEMENT    = 4;
  private static final int ST_ELEMENT_NAME   = 5;
  private static final int ST_ARG_START      = 6;
  private static final int ST_ARG_NAME       = 7;
  private static final int ST_AFTER_ARG_NAME = 8;
  private static final int ST_BEFORE_VALUE   = 9;
  private static final int ST_VALUE          = 10;
  private static final int ST_QUOTED_VALUE   = 11;
  private static final int ST_ENTITY_START   = 12;
  private static final int ST_ENTITY_HASH    = 13;
  private static final int ST_ENTITY_NUMBER  = 14;
  private static final int ST_ENTITY_NAME    = 15;

  //Pseudo constants: human readable state descriptions for error messages
  static String[] statenames = {
    "Neither inside Element nor Entity",
    "At the beginning of an Element",
    "At the beginning of a Special Element",
    "At the beginning of an end element",
    "Inside the name of an Element",
    "At the beginning of an argument",
    "Inside the name of an argument",
    "After the name of an argument",
    "Before the value of an argument",
    "Inside the value of an argument",
    "Inside the value of a quoted argument",
    "At the beginning of an Entity",
    "Before the first number of a # Entity",
    "Inside the number of a # Entity",
    "Inside a non-# Entity"
  };

  //Instance variables
  boolean stopRequested;
  //Initialized eagerly so that getBuffer() is safe before the first scan.
  StringBuffer buffer = new StringBuffer();

  /**
   * The constructor actually does nothing.
   */
  public SGMLScanner()
  {
  }

  /**
   * Starts scanning the SGML data. This method should be called to
   * actually start the lexical analysis of the SGML document. It calls
   * init() first, then reads the Reader one character at a time and invokes
   * the callback methods, and finally calls exit(). exit() is not called
   * when an exception is thrown.
   * <p>
   * Scanning ends at the end of the input or as soon as a callback has
   * called stopScanner(). The Reader is not closed.
   *
   * @param      f1 Reader with input data.
   * @exception  IOException if there is an I/O error with the reader.
   * @exception  SGMLScannerException if a syntactical error occurs during
   *             scanning, including input that ends in the middle of an
   *             element or entity.
   */
  public void startScanner(Reader f1)
  throws IOException, SGMLScannerException
  {
    int state = ST_TEXT;
    SGMLElement element = new SGMLElement();
    StringBuffer argname = new StringBuffer(100);
    StringBuffer argvalue = new StringBuffer(200);
    StringBuffer entitybuffer = new StringBuffer(100);

    stopRequested = false;
    buffer = new StringBuffer(200);
    init();
    while (!stopRequested) {
      int c = f1.read();
      if (c == -1) {
        //End of input must never be fed into the state machine: casting -1
        //to char would yield U+FFFF and be treated as ordinary data.
        if (state != ST_TEXT) {
          throw new SGMLScannerException(
            "Unexpected end of input" + describeState(state) +
            ". Buffer: " + buffer.toString()
          );
        }
        if (buffer.length() > 0) {
          pcData(buffer.toString());
        }
        break;
      }
      char ch = (char)c;
      boolean accepted = true;
      switch (state) {
      case ST_TEXT: //---State 1: Neither inside Element nor Entity
        if (ch == '<') { //Element: flush pending text, restart buffer at '<'
          if (buffer.length() > 0) {
            pcData(buffer.toString());
          }
          buffer.setLength(0);
          buffer.append(ch);
          state = ST_ELEMENT_START;
        } else if (ch == '&') { //Entity: collected separately from the text
          entitybuffer = new StringBuffer();
          entitybuffer.append(ch);
          state = ST_ENTITY_START;
        } else { //Plain character data
          buffer.append(ch);
        }
        break;
      case ST_ELEMENT_START: //---State 2: At the beginning of an Element
        buffer.append(ch);
        if (ch == '!') { //Special Element
          state = ST_SPECIAL;
        } else if (ch == '/') { //Terminating Element
          state = ST_END_ELEMENT;
        } else if (Character.isLetterOrDigit(ch)) { //Element
          element = new SGMLElement();
          element.setTerminating(false);
          element.appendToName(ch);
          state = ST_ELEMENT_NAME;
        } else { //Nothing applicable
          accepted = false;
        }
        break;
      case ST_SPECIAL: //---State 3: At the beginning of a Special Element
        //The special element ends at the first '>' that is encountered.
        buffer.append(ch);
        if (ch == '>') {
          special(buffer.toString());
          buffer.setLength(0);
          state = ST_TEXT;
        }
        break;
      case ST_END_ELEMENT: //---State 4: At the beginning of an end element
        buffer.append(ch);
        if (Character.isLetterOrDigit(ch)) {
          element = new SGMLElement();
          element.setTerminating(true);
          element.appendToName(ch);
          state = ST_ELEMENT_NAME;
        } else { //Nothing applicable
          accepted = false;
        }
        break;
      case ST_ELEMENT_NAME: //---State 5: Inside the name of an Element
        buffer.append(ch);
        if (ch == '>') {
          emitElement(element);
          state = ST_TEXT;
        } else if (Character.isWhitespace(ch)) {
          state = ST_ARG_START;
        } else if (Character.isLetterOrDigit(ch)) {
          element.appendToName(ch);
        } else { //Nothing applicable
          accepted = false;
        }
        break;
      case ST_ARG_START: //---State 6: At the beginning of an argument
        buffer.append(ch);
        if (ch == '>') {
          emitElement(element);
          state = ST_TEXT;
        } else if (Character.isWhitespace(ch)) {
          //stay here and wait for next char
        } else if (Character.isLetterOrDigit(ch)) {
          //Fresh buffers for every argument: the previous ones have been
          //handed over to the element via addArgument.
          argname = new StringBuffer();
          argvalue = new StringBuffer();
          argname.append(ch);
          state = ST_ARG_NAME;
        } else {
          accepted = false;
        }
        break;
      case ST_ARG_NAME: //---State 7: Inside the name of an argument
        buffer.append(ch);
        if (ch == '>') {
          element.addArgument(argname, argvalue, false);
          emitElement(element);
          state = ST_TEXT;
        } else if (ch == '=') {
          state = ST_BEFORE_VALUE;
        } else if (Character.isWhitespace(ch)) {
          state = ST_AFTER_ARG_NAME;
        } else if (Character.isLetterOrDigit(ch) || ch == '-') {
          argname.append(ch);
        } else {
          accepted = false;
        }
        break;
      case ST_AFTER_ARG_NAME: //---State 8: After the name of an argument
        buffer.append(ch);
        if (ch == '>') {
          element.addArgument(argname, argvalue, false);
          emitElement(element);
          state = ST_TEXT;
        } else if (ch == '=') {
          state = ST_BEFORE_VALUE;
        } else if (Character.isWhitespace(ch)) {
          //stay here and wait for next char
        } else if (Character.isLetterOrDigit(ch)) {
          //The previous argument had no value; store it and start the next
          element.addArgument(argname, argvalue, false);
          argname = new StringBuffer();
          argvalue = new StringBuffer();
          argname.append(ch);
          state = ST_ARG_NAME;
        } else {
          accepted = false;
        }
        break;
      case ST_BEFORE_VALUE: //---State 9: Before the value of an argument
        buffer.append(ch);
        if (ch == '>') {
          element.addArgument(argname, argvalue, true);
          emitElement(element);
          state = ST_TEXT;
        } else if (ch == '\"') {
          //The quote characters are kept as part of the value
          argvalue.append(ch);
          state = ST_QUOTED_VALUE;
        } else if (Character.isWhitespace(ch)) {
          //stay here and wait for next char
        } else {
          argvalue.append(ch);
          state = ST_VALUE;
        }
        break;
      case ST_VALUE: //---State 10: Inside the value of an argument
        buffer.append(ch);
        if (ch == '>') {
          element.addArgument(argname, argvalue, true);
          emitElement(element);
          state = ST_TEXT;
        } else if (Character.isWhitespace(ch)) {
          element.addArgument(argname, argvalue, true);
          state = ST_ARG_START;
        } else {
          argvalue.append(ch);
        }
        break;
      case ST_QUOTED_VALUE: //---State 11: Inside the value of a quoted argument
        buffer.append(ch);
        argvalue.append(ch);
        if (ch == '\"') {
          element.addArgument(argname, argvalue, true);
          state = ST_ARG_START;
        }
        break;
      case ST_ENTITY_START: //---State 12: At the beginning of an Entity
        entitybuffer.append(ch);
        if (ch == '#') {
          state = ST_ENTITY_HASH;
        } else if (Character.isLetterOrDigit(ch)) {
          state = ST_ENTITY_NAME;
        } else {
          accepted = false;
        }
        break;
      case ST_ENTITY_HASH: //---State 13: Before the first number of a # Entity
        entitybuffer.append(ch);
        if (Character.isDigit(ch)) {
          state = ST_ENTITY_NUMBER;
        } else {
          accepted = false;
        }
        break;
      case ST_ENTITY_NUMBER: //---State 14: Inside the number of a # Entity
        entitybuffer.append(ch);
        if (Character.isDigit(ch)) {
          //stay here and wait for next digit
        } else if (ch == ';') {
          //The (possibly converted) entity becomes part of the pending text
          buffer.append(entity(entitybuffer.toString()));
          state = ST_TEXT;
        } else {
          accepted = false;
        }
        break;
      case ST_ENTITY_NAME: //---State 15: Inside a non-# Entity
        entitybuffer.append(ch);
        if (Character.isLetterOrDigit(ch)) {
          //stay here and wait for next char
        } else if (ch == ';') {
          buffer.append(entity(entitybuffer.toString()));
          state = ST_TEXT;
        } else {
          accepted = false;
        }
        break;
      default: //---Default: Unknown state error
        accepted = false;
        break;
      }
      if (!accepted) {
        //---Throw SGMLScannerException
        String msg = "Invalid character " + c;
        if (c >= 32) {
          msg += " (\'" + ch + "\')";
        }
        msg += describeState(state);
        msg += ". Buffer: " + buffer.toString();
        throw new SGMLScannerException(msg);
      }
    }
    exit();
  }

  /**
   * Reports a completely parsed element through the element() callback and
   * clears the raw text buffer afterwards, so that getBuffer() still returns
   * the element's text while the callback is running.
   */
  private void emitElement(SGMLElement parsed)
  {
    element(parsed);
    buffer.setLength(0);
  }

  /**
   * Builds the " in state N (description)" fragment used in error messages.
   */
  private static String describeState(int state)
  {
    String text = " in state " + state;
    if (state >= 1 && state <= statenames.length) {
      text += " (" + statenames[state - 1] + ")";
    } else {
      text += " (unknown)";
    }
    return text;
  }

  /**
   * The init method will be called during the initialization of the
   * startScanner method, before the first character is read. It can be
   * overridden if the derived class wants to execute specific
   * initialization code.
   */
  protected void init()
  {
  }

  /**
   * This method can be called from any of the callback methods to interrupt
   * the scanner. The scanner stops before reading the next character;
   * exit() is still called.
   */
  protected void stopScanner()
  {
    stopRequested = true;
  }

  /**
   * Returns the current contents of the scanner's text buffer. While the
   * element() or special() callback is running, this is the complete raw
   * text of that element, from '&lt;' up to and including '&gt;'. While the
   * entity() callback is running, it is the character data collected before
   * the entity, not the entity itself. Before the first scan the buffer is
   * empty.
   *
   * @return     The contents as described above.
   */
  protected String getBuffer()
  {
    return buffer.toString();
  }

  /**
   * This method is called whenever an element has been completely
   * parsed. The scanner passes the name characters to the SGMLElement
   * unchanged; any case conversion of the name (the original documentation
   * states that it is upper-cased) is up to SGMLElement.
   *
   * @param      element The parsed element with its name and arguments.
   */
  protected void element(SGMLElement element)
  {
  }

  /**
   * This method is called whenever a special element (one that starts
   * with "&lt;!") has been completely parsed.
   *
   * @param      content The raw text of the special element, including the
   *             leading "&lt;!" and the terminating "&gt;".
   */
  protected void special(String content)
  {
  }

  /**
   * This method is called whenever an entity has been parsed. It should
   * return either the converted or unchanged entity as a String. The
   * returned text is appended to the pending PCDATA.
   *
   * @param      name The entity as found in the input, including the
   *             leading '&amp;' and the terminating ';'.
   * @return     The text to insert in place of the entity; this default
   *             implementation returns the entity unchanged.
   */
  protected String entity(String name)
  {
    return name;
  }

  /**
   * This callback method is called whenever a PCDATA sequence has been
   * found, i.e. when an element starts or the input ends and text is
   * pending. Any entities are already converted using the return value of
   * the entity() method.
   *
   * @param      data The collected character data; never empty.
   */
  protected void pcData(String data)
  {
  }

  /**
   * Called once after scanning has finished, either because the end of the
   * input was reached or because stopScanner() was called. It is not called
   * when scanning is aborted by an exception.
   */
  protected void exit()
  {
  }
}
