/*
 * File........: c:/ARC/PROG/java/gk/lscript/Scanner.java
 * Package.....: gk.lscript
 * Created.....: 98/08/06, Guido Krueger
 * RCS.........: $Revision: 1.6 $
 *               $Date: 1998/08/15 01:17:36 $ $Author: guido $
 *
 * Copyright (c) 1998 Guido Krueger. All Rights Reserved.
 *
 * Permission to use, copy, modify, and distribute this
 * software and its documentation for NON-COMMERCIAL purposes
 * and without fee is hereby granted provided that this
 * copyright notice appears in all copies.
 *
 * THE AUTHOR MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE 
 * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, 
 * INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR 
 * NON-INFRINGEMENT. THE AUTHOR SHALL NOT BE LIABLE FOR ANY 
 * DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING 
 * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
 */
package gk.lscript;

import java.io.*;

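/**
 * Lexical analyser for the LScript language. The following lexical 
 * elements are recognized:
 * <ul>
 * <li>whole number:        -?[0-9]+
 * <li>floating point:      as defined in java.lang.Double
 * <li>string:              ".*" (must not contain " itself or a line break)
 * <li>identifier:          one or more letters, digits or any of
 *                          + - * / % &lt; &gt; = ^ _ &amp; | . (must not
 *                          start with a digit; a leading - that is
 *                          directly followed by a digit starts a number)
 * <li>lparen:              (
 * <li>rparen:              )
 * <li>quote:               ['~]
 * <li>comment:             ;.*[$;]  (ends at end of line or at the next ;)
 * <li>whitespace:          [\t\r\n ]
 * <li>delimiter:           [()\t\r\n ]  (note: not ['";])
 * </ul>
 */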
public class Scanner
{
  //Constants-------------------------------------------------
  public static final int INTEGER    =  1;
  public static final int DOUBLE     =  2;
  public static final int STRING     =  3;
  public static final int IDENTIFIER =  4;
  public static final int LPAREN     =  5;
  public static final int RPAREN     =  6;
  public static final int QUOTE      =  7;
  public static final int COMMENT    =  8;
  public static final int ERROR      = 91;
  public static final int NOSYMBOL   = 92;
  public static final int EOF        = 99;

  /** Maximum number of characters fetched from the reader per read. */
  private static final int BUFCAP = 1000;

  //States of the scanner automaton
  private static final int STATE_INITIAL      = 1;
  private static final int STATE_NUMBER       = 2;
  private static final int STATE_MINUS        = 3;
  private static final int STATE_STRING       = 4;
  private static final int STATE_IDENTIFIER   = 5;
  private static final int STATE_COMMENT      = 6;

  //Instance variables----------------------------------------
  private Reader       input;       //source of the LScript text
  private int          linenum;     //current line, starting at 1
  private char[]       buf;         //read buffer
  private int          buflen;      //valid chars in buf, -1 after end of input
  private int          bufpos;      //index of the next unread char in buf
  private int          state;       //current automaton state
  private String       error;       //message of the last ERROR token
  private String       stringdata;  //text of the last STRING/IDENTIFIER/COMMENT
  private double       doubledata;  //value of the last DOUBLE token
  private int          intdata;     //value of the last INTEGER token
  private StringBuffer data;        //characters of the token being collected

  /**
   * Creates a new LScript scanner. The input argument provides a Reader
   * with the LScript code.
   */
  public Scanner(Reader input)
  {
    this.input = input;
    linenum = 1;
    buf = new char[BUFCAP];
    buflen = 0;
    bufpos = 0;
    data = new StringBuffer(100);
  }

  /**
   * Returns the next lexical symbol in the input according to the following
   * table:
   * <pre>
   * Return       kind of token            retrieve token data using
   * ----------------------------------------------------------------------
   * INTEGER      Whole number             getIntData()
   * DOUBLE       Floating point           getDoubleData()
   * STRING       String                   getStringData()
   * IDENTIFIER   Identifier               getStringData()
   * LPAREN       (                        --
   * RPAREN       )                        --
   * QUOTE        ' or ~                   --
   * COMMENT      Comment                  getStringData()
   * ERROR        Lexical or I/O error     getError(), getLine()
   * NOSYMBOL     Internal error           --
   * EOF          End of input reached     --
   * ----------------------------------------------------------------------
   * </pre>
   * 
   * A failure of the underlying Reader is reported as ERROR (it is not
   * mistaken for the end of the input).
   * <p>
   * After either of ERROR, EOF or NOSYMBOL has been returned, there 
   * should be no further calls to nextSym().
   */
  public int nextSym()
  {
    int ret = NOSYMBOL;
    boolean stop = false;
    boolean wholenumber = true;
    data.setLength(0);
    state = STATE_INITIAL;
    while (!stop) {
      //fill buffer
      if (bufpos >= buflen) {
        bufpos = 0;
        try {
          buflen = input.read(buf, 0, BUFCAP);
        } catch (IOException e) {
          //report the failure instead of silently truncating the input
          buflen = -1;
          error = "I/O error: " + e;
          return ERROR;
        }
      }
      if (buflen == -1 || bufpos >= buflen) { //no more characters
        ret = symbolAtEndOfInput(wholenumber);
        stop = true;
      } else {
        //scan next token
        char c = buf[bufpos++];
        if (state == STATE_INITIAL) {
          if (c == '(') {
            ret = LPAREN;
            stop = true;
          } else if (c == ')') {
            ret = RPAREN;
            stop = true;
          } else if (c == '\'' || c == '~') {
            ret = QUOTE;
            stop = true;
          } else if (c >= '0' && c <= '9') {
            data.append(c);
            state = STATE_NUMBER;
          } else if (c == '-') {
            //number or identifier, decided by the following character
            state = STATE_MINUS;
          } else if (c == '\"') {
            state = STATE_STRING;
          } else if (c == ';') {
            state = STATE_COMMENT;
          } else if (isIdentifier(c)) {
            data.append(c);
            state = STATE_IDENTIFIER;
          } else if (isWhiteSpace(c)) {
            if (c == '\n') {
              ++linenum;
            }
          } else {
            stop = true;
            ret = ERROR;
            error = "invalid character: " + c;
          }
        } else if (state == STATE_NUMBER) {
          //signs are collected unchecked (needed for exponents such as
          //1e-5 or 1e+5); misplaced ones are rejected by makeNumber()
          if ((c >= '0' && c <= '9') || c == '-' || c == '+') {
            data.append(c);
          } else if (c == 'e' || c == 'E' || c == '.') {
            data.append(c);
            wholenumber = false;
          } else if (isDelimiter(c)) {
            --bufpos; //write back current char
            ret = makeNumber(wholenumber);
            stop = true;
          } else {
            stop = true;
            ret = ERROR;
            error = "invalid character: " + c;
          }
        } else if (state == STATE_MINUS) {
          if (c >= '0' && c <= '9') {
            state = STATE_NUMBER;
            data.append('-');
            data.append(c);
          } else {
            state = STATE_IDENTIFIER;
            data.append('-');
            --bufpos; //write back current char
          }
        } else if (state == STATE_STRING) {
          if (c == '\"') {
            stringdata = data.toString();
            ret = STRING;
            stop = true;
          } else if (c == '\r' || c == '\n') {
            ret = ERROR;
            stop = true;
            error = "unexpected end of line in string";
          } else {
            data.append(c);
          }
        } else if (state == STATE_IDENTIFIER) {
          if (isIdentifier(c)) {
            data.append(c);
          } else if (isDelimiter(c)) {
            stringdata = data.toString();
            ret = IDENTIFIER;
            stop = true;
            --bufpos; //write back current char
          } else {
            ret = ERROR;
            stop = true;
            error = "invalid character in identifier";
          }
        } else if (state == STATE_COMMENT) {
          if (c == '\r' || c == '\n' || c == ';') {
            if (c != ';') {
              //write back the line break so that the initial state
              //skips it as whitespace and counts the new line
              --bufpos;
            }
            stringdata = data.toString();
            ret = COMMENT;
            stop = true;
          } else {
            data.append(c);
          }
        } else {
          stop = true;
          ret = ERROR;
          error = "internal error: unknown state " + state;
        }
      }
    }
    return ret;
  }

  /**
   * Returns data in case of STRING, IDENTIFIER or COMMENT token.
   */
  public String getStringData()
  {
    return stringdata;
  }

  /**
   * Returns data in case of DOUBLE token.
   */
  public double getDoubleData()
  {
    return doubledata;
  }

  /**
   * Returns data in case of INTEGER token.
   */
  public int getIntData()
  {
    return intdata;
  }

  /**
   * Returns current line number.
   */
  public int getLine()
  {
    return linenum;
  }

  /**
   * Returns error message in case of ERROR token.
   */
  public String getError()
  {
    return error;
  }

  /**
   * Returns the symbol as a String, or "unknown" if the argument is
   * none of the symbol constants of this class.
   */
  public String getSymbolName(int symbol) 
  {
    String ret = "unknown";
    switch (symbol) {
    case INTEGER:
      ret = "INTEGER";
      break;
    case DOUBLE:
      ret = "DOUBLE";
      break;
    case STRING:
      ret = "STRING";
      break;
    case IDENTIFIER:
      ret = "IDENTIFIER";
      break;
    case LPAREN:
      ret = "LPAREN";
      break;
    case RPAREN:
      ret = "RPAREN";
      break;
    case QUOTE:
      ret = "QUOTE";
      break;
    case COMMENT:
      ret = "COMMENT";
      break;
    case ERROR:
      ret = "ERROR";
      break;
    case NOSYMBOL:
      ret = "NOSYMBOL";
      break;
    case EOF:
      ret = "EOF";
      break;
    }
    return ret;
  }

  //Private methods----------------------------------------

  /**
   * Determines the symbol to return when the input is exhausted while
   * the automaton is in the current state. A pending number, identifier
   * or comment is completed; a pending lone minus sign is an identifier;
   * an unterminated string is an error.
   */
  private int symbolAtEndOfInput(boolean wholenumber)
  {
    if (state == STATE_INITIAL) {
      return EOF;
    }
    if (state == STATE_NUMBER) {
      return makeNumber(wholenumber);
    }
    if (state == STATE_MINUS) {
      //the '-' has not been stored in data yet
      stringdata = "-";
      return IDENTIFIER;
    }
    if (state == STATE_IDENTIFIER) {
      stringdata = data.toString();
      return IDENTIFIER;
    }
    if (state == STATE_COMMENT) {
      stringdata = data.toString();
      return COMMENT;
    }
    error = "Unexpected end of input";
    return ERROR;
  }

  /** Returns true for blank, tab, line feed and carriage return. */
  private boolean isWhiteSpace(char c)
  {
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
  }

  /** Returns true if c may be part of an identifier. */
  private boolean isIdentifier(char c)
  {
    if (c >= 'a' && c <= 'z') {
      return true;
    }
    if (c >= 'A' && c <= 'Z') {
      return true;
    }
    if (c >= '0' && c <= '9') {
      return true;
    }
    return "+-*/%<>=^_&|.".indexOf(c) != -1;
  }

  /** Returns true if c terminates a number or an identifier. */
  private boolean isDelimiter(char c)
  {
    return c == '(' || c == ')' || isWhiteSpace(c);
  }

  /**
   * Converts the collected characters into intdata (wholenumber is true)
   * or doubledata (otherwise). Returns INTEGER or DOUBLE on success and
   * ERROR, with the error message set, if the text is not a valid number.
   */
  private int makeNumber(boolean wholenumber)
  {
    int ret = 0;
    try {
      if (wholenumber) {
        intdata = Integer.parseInt(data.toString());
        ret = INTEGER;
      } else {
        doubledata = Double.valueOf(data.toString()).doubleValue();
        ret = DOUBLE;
      }
    } catch (NumberFormatException e) {
      ret = ERROR;
      error = e.toString();
    }
    return ret;
  }

  //---main method for testing only----------------------------------------
  /**
   * Prints the symbols of the program given as the only argument. The
   * argument is either the program text itself or, with a leading @,
   * the name of a file containing it.
   */
  public static void main(String args[])
  {
    if (args.length != 1) {
      System.out.println("usage: java Scanner (lisp-prog | @lisp-file)");
      System.exit(1);
    }
    Reader reader = null;
    if (args[0].startsWith("@")) {
      try {
        reader = new FileReader(args[0].substring(1));
      } catch (FileNotFoundException e) {
        System.out.println(e.toString());
        System.exit(1);
      }
    } else {
      reader = new StringReader(args[0]);
    }
    Scanner scanner = new Scanner(reader);
    try {
      while (true) {
        int sym = scanner.nextSym();
        if (sym == INTEGER) {
          System.out.println("<integer: " + scanner.getIntData() + ">");
        } else if (sym == DOUBLE) {
          System.out.println("<double: " + scanner.getDoubleData() + ">");
        } else if (sym == STRING) {
          System.out.println("<string: \"" + scanner.getStringData() + "\">");
        } else if (sym == IDENTIFIER) {
          System.out.println("<identifier: " + scanner.getStringData() + ">");
        } else if (sym == LPAREN) {
          System.out.println("<lparen>");
        } else if (sym == RPAREN) {
          System.out.println("<rparen>");
        } else if (sym == QUOTE) {
          System.out.println("<quote>");
        } else if (sym == COMMENT) {
          System.out.println("<comment: " + scanner.getStringData() + ">");
        } else if (sym == ERROR) {
          System.out.println(
            "<error(" + scanner.getLine() + "): " + scanner.getError() + ">"
          );
          break;
        } else if (sym == NOSYMBOL) {
          System.out.println("<nosymbol>");
          break;
        } else if (sym == EOF) {
          System.out.println("<eof>");
          break;
        } else {
          System.out.println("unknown symbol: " + sym);
        }
      }
    } finally {
      try {
        reader.close();
      } catch (IOException ignored) {
        //the input was only read; nothing is lost if closing fails
      }
    }
  }
}
