/*
 * Copyright (c) 1999-2000 by Simon St.Laurent.  All Rights Reserved.
 *
 * This program is open source software; you may use, copy, modify, and
 * redistribute it under the terms of the LICENSE with which it was
 * originally distributed.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * LICENSE for more details.
 */

//package org.simonstl.xml;


import java.io.*;

/**
 * <p>A {@link FilterReader} that rewrites Microsoft Word 2000 HTML on the fly so
 * that it is closer to well-formed XML.</p>
 *
 * <p>While characters are read, the filter:</p>
 * <ul>
 *   <li>emits a small DOCTYPE declaration first, pulling in the three XHTML
 *       entity sets;</li>
 *   <li>replaces comment delimiters ({@code <!--} and {@code -->}) with a
 *       space, keeping the comment content;</li>
 *   <li>turns {@code <![if ...]>} / {@code <!--[if ...]>} into
 *       {@code <msc:if condition='...'>} and {@code <![endif]>} into
 *       {@code </msc:if>};</li>
 *   <li>wraps unquoted attribute values in double quotes;</li>
 *   <li>self-closes {@code meta}, {@code link} and {@code br};</li>
 *   <li>wraps the content of {@code script} elements in a CDATA section.</li>
 * </ul>
 *
 * <p>Element names are matched case-sensitively (lower case only). Markup
 * declarations that are not recognised (for example {@code <!DOCTYPE ...>})
 * are passed through unchanged.</p>
 *
 * <p>NOTE(review): {@code skip}, {@code mark} and {@code reset} are inherited
 * from {@link FilterReader} and act on the wrapped reader only; they do not
 * know about the characters queued inside this filter.</p>
 *
 * @author Simon St.Laurent
 * @version $Date: 2000/07/27 $
 */
public final class O2KCleaner extends FilterReader
{
    /** Value returned by {@code Reader.read()} at end of stream. */
    private static final int EOF = -1;

    /** The generated DOCTYPE, minus its leading '&lt;' (which is returned directly). */
    private static final String DOCTYPE_TAIL =
        "!DOCTYPE html [\n"
        + "<!ENTITY % lat1 SYSTEM 'http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent'>\n"
        + "%lat1;\n"
        + "<!ENTITY % special SYSTEM 'http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent'>\n"
        + "%special;\n"
        + "<!ENTITY % symbols SYSTEM 'http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent'>\n"
        + "%symbols;\n"
        + "]>\n";

    // msc is "Microsoft Conditional".
    private static final String MSC_IF_START = "<msc:if condition='";
    private static final String MSC_IF_FINISH = "' xmlns:msc='http://simonstl.com/projects/o2k/'>";
    private static final String MSC_IF_END = "</msc:if> ";

    /**
     * Creates a cleaner over a byte stream, decoded with the platform default
     * charset (as {@code new InputStreamReader(in)} does).
     *
     * @param in the stream containing Word 2000 HTML
     */
    public O2KCleaner (InputStream in) {
        this(new InputStreamReader(in));
    }

    /**
     * Creates a cleaner over a character stream. The reader is wrapped in a
     * {@link BufferedReader} because the filter reads one character at a time.
     *
     * @param in the reader supplying Word 2000 HTML
     */
    public O2KCleaner (Reader in) {
        super(new BufferedReader(in));
    }

    /** Characters already produced by the filter and waiting to be returned. */
    private final StringBuffer pending = new StringBuffer();

    /** False until the generated DOCTYPE has been queued. */
    private boolean docStarted = false;

    /** True between a {@code <script ...>} start tag and the next {@code </}. */
    private boolean inScript = false;

    /** Set once the bulk read has seen end of stream. */
    private boolean endOfStream = false;

    /**
     * Returns the next character of the cleaned document.
     *
     * <p>The first call returns the '&lt;' of the generated DOCTYPE. After that,
     * queued output is drained before any more input is consumed.</p>
     *
     * @return the next character, or -1 at end of stream
     * @throws IOException if the underlying reader fails
     */
    public int read() throws IOException {
        if (!docStarted) {
            docStarted = true;
            pending.append(DOCTYPE_TAIL);
            return '<';
        }

        if (pending.length() > 0) {
            // Remove exactly one UTF-16 unit from the front. (Reversing the
            // buffer instead would keep surrogate pairs together and drop the
            // wrong unit.)
            char next = pending.charAt(0);
            pending.deleteCharAt(0);
            return next;
        }

        int c = in.read();
        if (c == '-') {
            return afterHyphen();
        }
        if (c == '<') {
            return afterLessThan();
        }
        return c;
    }

    /**
     * Handles input following a '-': the comment tail {@code -->} collapses to
     * a single space. Anything else is passed through; the look-ahead
     * characters are queued verbatim and are not scanned for markup.
     */
    private int afterHyphen() throws IOException {
        int second = in.read();
        if (second == EOF) {
            return '-';
        }
        pending.append((char) second);
        if (second == '-') {
            int third = in.read();
            if (third == '>') {
                pending.setLength(0);
                return ' ';
            }
            if (third != EOF) {
                pending.append((char) third);
            }
        }
        return '-';
    }

    /**
     * Handles input following a '&lt;': a markup declaration, the end of a
     * script, script content that merely contains '&lt;', or an ordinary tag.
     *
     * @return the character to hand out before the queued output
     */
    private int afterLessThan() throws IOException {
        int second = in.read();
        if (second == '!') {
            return afterBang();
        }
        if (second == EOF) {
            // A lone trailing '<' is passed through.
            return '<';
        }

        boolean closingScript = (second == '/') && inScript;
        if (closingScript) {
            // Together with the ']' returned below this yields "]]><".
            inScript = false;
            pending.append("]><");
        }

        pending.append((char) second);
        if (!inScript) {
            // Inside a script a '<' is plain content and is not parsed as a tag.
            copyTag();
        }
        return closingScript ? ']' : '<';
    }

    /**
     * Handles input following {@code <!}: comments and Microsoft conditionals.
     * Unrecognised declarations are queued unchanged.
     */
    private int afterBang() throws IOException {
        int third = in.read();
        if (third == '[') {
            // Need to see if it's an if or an endif; MS is not consistent.
            int keyword = in.read();
            if (keyword == 'e') {
                skipThroughGreaterThan();
                pending.append(MSC_IF_END);
                return ' ';
            }
            if (keyword == 'i') {
                in.read();  // 'f'
                in.read();  // the character after "if" (normally a space)
                queueConditionalStart();
                return ' ';
            }
            pending.append("![");
            if (keyword != EOF) {
                pending.append((char) keyword);
            }
            return '<';
        }
        if (third == '-') {
            in.read();  // second '-' of "<!--" (not verified)
            int afterDashes = in.read();
            if (afterDashes == '[') {
                in.read();  // 'i'
                in.read();  // 'f'; the following space stays in the condition
                queueConditionalStart();
            }
            // Otherwise just a comment: the opener, plus the one character
            // after it, is replaced by a space.
            return ' ';
        }
        pending.append('!');
        if (third != EOF) {
            pending.append((char) third);
        }
        return '<';
    }

    /** Consumes input up to and including the next '&gt;', or to end of stream. */
    private void skipThroughGreaterThan() throws IOException {
        int c;
        do {
            c = in.read();
        } while ((c != '>') && (c != EOF));
    }

    /**
     * Queues {@code <msc:if condition='...' xmlns:msc='...'>}, taking the
     * condition from the input up to the closing {@code ]>}.
     */
    private void queueConditionalStart() throws IOException {
        pending.append(MSC_IF_START);
        int conditionStart = pending.length();
        int c;
        do {
            c = in.read();
            if (c == EOF) {
                break;
            }
            pending.append((char) c);
        } while (c != '>');

        if (c == '>') {
            // Drop the terminating "]>" without ever cutting into the prefix.
            pending.setLength(pending.length() - 1);
            int last = pending.length() - 1;
            if ((last >= conditionStart) && (pending.charAt(last) == ']')) {
                pending.setLength(last);
            }
        }
        pending.append(MSC_IF_FINISH);
    }

    /**
     * Copies the rest of a tag (everything after its first character, which is
     * already queued) up to and including '&gt;', fixing it up on the way.
     * Stops quietly if the input ends inside the tag.
     */
    private void copyTag() throws IOException {
        boolean inElementName = true;
        boolean inAttribute = false;
        boolean emptyElement = false;
        int c;
        do {
            c = in.read();
            if (c == EOF) {
                return;
            }

            boolean delimiter = (c == '>') || (c == ' ') || (c == '\r') || (c == '\n');
            if (delimiter && inElementName) {
                // The queue holds exactly the element name at this point.
                inElementName = false;
                String name = pending.toString();
                emptyElement = name.equals("meta") || name.equals("link") || name.equals("br");
                if (name.equals("script")) {
                    inScript = true;
                }
            }
            if (delimiter && inAttribute) {
                closeQuoteUnlessCharset();
                inAttribute = false;
            }

            if ((c == '>') && emptyElement) {
                pending.append('/');
            }
            pending.append((char) c);
            if ((c == '>') && inScript) {
                pending.append("<![CDATA[");
            }

            if (c == '=') {
                int first = in.read();
                if (first == EOF) {
                    return;
                }
                if ((first != '"') && (first != '\'')) {
                    inAttribute = true;
                    closeQuoteUnlessCharset();
                }
                pending.append((char) first);
            }
        } while (c != '>');
    }

    /**
     * Appends a double quote unless the tag queued so far mentions "charset".
     * Presumably this protects {@code charset=...} inside an already quoted
     * {@code content} attribute of a meta element.
     */
    private void closeQuoteUnlessCharset() {
        if (pending.toString().indexOf("charset") == -1) {
            pending.append('"');
        }
    }

    /**
     * Reads cleaned characters into part of an array by calling {@link #read()}
     * repeatedly. Derived from a placeholder in Java I/O by Elliotte Rusty
     * Harold, http://www.oreilly.com/catalog/javaio/.
     *
     * @param text   destination array
     * @param offset index of the first character to store
     * @param length maximum number of characters to read
     * @return the number of characters stored, or -1 if the end of the stream
     *         was reached before any character could be stored
     * @throws IOException if the underlying reader fails
     * @throws IndexOutOfBoundsException if offset and length do not describe a
     *         range inside text
     */
    public int read(char[] text, int offset, int length) throws IOException {
        if ((offset < 0) || (length < 0) || (length > text.length - offset)) {
            throw new IndexOutOfBoundsException();
        }
        if (endOfStream) return -1;
        if (length == 0) return 0;

        int numRead = 0;
        while (numRead < length) {
            int temp = this.read();
            if (temp == EOF) {
                this.endOfStream = true;
                break;
            }
            text[offset + numRead] = (char) temp;
            numRead++;
        }
        // Reader contract: -1, not 0, when nothing could be read.
        return (numRead == 0) ? -1 : numRead;
    }

    /**
     * Test driver: cleans the named file and prints the result to standard
     * output. The file is decoded with the platform default charset.
     *
     * @param args a single file name
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: java O2KCleaner <filename>");
            return;
        }

        BufferedReader cleaned = null;
        try {
            cleaned = new BufferedReader(new O2KCleaner(new FileReader(args[0])));
            String line;
            while ((line = cleaned.readLine()) != null) {
                System.out.println(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close even when reading failed part-way through.
            if (cleaned != null) {
                try {
                    cleaned.close();
                } catch (IOException e) {
                    System.err.println("Could not close " + args[0] + ": " + e);
                }
            }
        }
    }
}
