/*
 * Copyright 2000 by Simon St.Laurent.  All Rights Reserved.
 *
 * This program is open source software; you may use, copy, modify, and
 * redistribute it under the terms of the LICENSE with which it was
 * originally distributed.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * LICENSE for more details.
 */

//package org.simonstl.xml;


import java.io.*;

/**
 * <p>This class strips out the comment marks and the 
 * &lt;!--[if...]&gt;&lt;![endif]--&gt; constructs from 
 * Microsoft Word 2000 HTML.  It also adds quotation marks to 
 * attributes when necessary and wraps scripts in CDATA sections.  
 * A simple DOCTYPE declaration includes the entities needed.</p>
 *
 * Many thanks to Elliotte Rusty Harold for his substantial readability
 * cleanup and his finding (and fixing) an encoding-related problem. 
 *
 * @author Simon St.Laurent
 * @version 0.03 $Date: 2000/08/03 $
 */
public final class O2KCleaner extends FilterReader
{
    /** Value returned by the wrapped reader at end of stream. */
    private static final int EOF = -1;

    /**
     * Elements declared EMPTY by HTML 4; their start tags are rewritten
     * as XML empty-element tags (&lt;br&gt; becomes &lt;br/&gt;).
     */
    private static final String[] EMPTY_ELEMENTS = {
        "area", "base", "basefont", "br", "col", "frame", "hr",
        "img", "input", "isindex", "link", "meta", "param"
    };

    /**
     * Creates a cleaner that decodes the given byte stream using the
     * platform default character encoding.  Callers that know the
     * encoding of the document should build their own Reader and use
     * {@link #O2KCleaner(Reader)} instead.
     *
     * @param in the raw Word 2000 HTML bytes
     */
    public O2KCleaner (InputStream in) {
      this(new InputStreamReader(in));
    }

    /**
     * Creates a cleaner on top of the given reader.  The reader is
     * wrapped in a BufferedReader because the cleaner consumes its
     * input one character at a time.
     *
     * @param in the Word 2000 HTML to clean
     */
    public O2KCleaner (Reader in) {
      super(new BufferedReader(in));
    }

    /** true once the XML declaration and DOCTYPE have been queued. */
    protected boolean docStarted    = false;
    /** true only while a piece of markup is being assembled inside read(). */
    protected boolean generating    = false;
    /** true while inside an attribute value that had to be quoted by the cleaner. */
    protected boolean inAttribute   = false;
    /** true while the characters being read still belong to the element name. */
    protected boolean inElementName = false;
    /** true when the tag being assembled must be closed with "/&gt;". */
    protected boolean emptyElement  = false;
    /** true between a script start tag and the next "&lt;/". */
    protected boolean inScript      = false;
    /** true while the tag that terminates a script is being assembled. */
    protected boolean endScript     = false;
    protected String encoding		= "ISO-8859-1";
    /** Generated output that has not been handed to the caller yet. */
    protected StringBuffer myBuffer = new StringBuffer();

    /**
     * Input characters that were read as look-ahead and have to be
     * processed again.  Used as a stack; at most two entries are ever
     * pending at once, the extra room is a safety margin.
     */
    private final int[] pushback = new int[4];
    private int pushbackCount = 0;

    /** Set once the single-character read() has reported end of stream. */
    private boolean endOfStream = false;

    /** returns the encoding that will be put into the XML declaration.
	  ISO-8859-1 is the default.
    */
    public String getEncoding() {
	  return encoding;
    }

    /** allows you to set the encoding that will be put into the XML declaration.
	  ISO-8859-1 is the default.  The value is only used when the first
	  character is read, so it must be set before reading starts.
    */
    public void setEncoding(String newEncoding) {
	  encoding=newEncoding;
    }

    /** Returns the next input character, honouring pending look-ahead. */
    private int nextChar() throws IOException {
        if (pushbackCount > 0) {
            return pushback[--pushbackCount];
        }
        return in.read();
    }

    /** Pushes one look-ahead character (or EOF) back onto the input. */
    private void unread(int ch) {
        pushback[pushbackCount++] = ch;
    }

    /**
     * Queues the XML declaration and the DOCTYPE with the XHTML entity
     * sets.  The leading '&lt;' of the declaration is returned directly
     * and the remainder is placed in the internal buffer.
     *
     * @return the first character of the generated prolog
     */
    protected int startDoc(){
        myBuffer.append("?xml version=\"1.0\" encoding=\"");
        myBuffer.append(encoding);
        myBuffer.append("\"?>\r\n");
        myBuffer.append("<!DOCTYPE html [\r\n");
        myBuffer.append(
         "<!ENTITY % lat1 SYSTEM 'http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent'>\r\n");
        myBuffer.append("%lat1;\r\n");
        myBuffer.append(
         "<!ENTITY % special SYSTEM 'http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent'>\r\n");
        myBuffer.append("%special;\r\n");
        myBuffer.append(
         "<!ENTITY % symbols SYSTEM 'http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent'>\r\n");
        myBuffer.append("%symbols;\r\n");
        myBuffer.append("]>\r\n");
        docStarted=true;
        return '<';
    }

    /**
     * Called after a '-' has been read.  If the next two characters are
     * "-&gt;" the whole comment tail is swallowed and replaced by a space.
     * Otherwise the look-ahead is pushed back onto the input, so that it
     * goes through normal processing, and the '-' itself is returned.
     *
     * @return ' ' for a comment tail, '-' otherwise
     * @throws IOException if the underlying reader fails
     */
    protected int commentTailRemove() throws IOException {
        int second = nextChar();
        if (second == '-') {
            int third = nextChar();
            if (third == '>') {
                return ' ';
            }
            unread(third);
        }
        unread(second);
        return '-';
    }

    /**
     * Handles markup that starts with "&lt;![", which Word uses both for
     * "&lt;![if ...]&gt;" and for "&lt;![endif]&gt;".  An endif is replaced by
     * "&lt;/msc:if&gt; " (msc is Microsoft Conditional); an if is handed to
     * {@link #ifConditional()}.  Anything else is passed through
     * unchanged.
     *
     * @return the character to hand to the caller before the buffer
     * @throws IOException if the underlying reader fails
     */
    protected int biConditional () throws IOException {
        //need to see if it's an if or endif.  not consistent in MS
        int eori = nextChar();
        if (eori == 'e') {
            int skipped = eori;
            while ((skipped != '>') && (skipped != EOF)) {
                skipped = nextChar();
            }
            myBuffer.setLength(0);
            myBuffer.append("</msc:if> ");
            generating = false;
            return ' ';
        }
        if (eori == 'i') {
            //the 'i' is gone already; ifConditional skips the next two
            //characters ("f" and the blank that follows it)
            return ifConditional();
        }
        //neither if nor endif: emit what was consumed and carry on
        unread(eori);
        myBuffer.append("![");
        generating = false;
        return '<';
    }

    /**
     * Turns the condition of a Word conditional into the start tag
     * "&lt;msc:if condition='...' xmlns:msc='...'&gt;".  Two characters are
     * skipped first, then everything up to the closing '&gt;' (or the end
     * of the input) is taken as the condition; the ']' in front of the
     * '&gt;' is dropped.
     *
     * @return a space, which precedes the generated tag in the output
     * @throws IOException if the underlying reader fails
     */
    protected int ifConditional()  throws IOException{
        myBuffer.append("<msc:if condition='");
        nextChar();
        nextChar();
        int conditionStart = myBuffer.length();
        int ch = nextChar();
        while ((ch != '>') && (ch != EOF)) {
            myBuffer.append((char) ch);
            ch = nextChar();
        }
        int last = myBuffer.length() - 1;
        if ((last >= conditionStart) && (myBuffer.charAt(last) == ']')) {
            myBuffer.setLength(last);
        }
        myBuffer.append("' xmlns:msc='http://simonstl.com/projects/o2k/'>");
        generating = false;
        return ' ';
    }

    /**
     * Records what kind of element is being assembled: sets
     * emptyElement for HTML elements that have no content and inScript
     * for a script start tag.  The comparison is case-sensitive.
     *
     * @param elemName the element name that has just been completed
     */
    protected void checkElementType(String elemName) {
        for (int i = 0; i < EMPTY_ELEMENTS.length; i++) {
            if (EMPTY_ELEMENTS[i].equals(elemName)) {
                emptyElement = true;
                break;
            }
        }
        if ("script".equals(elemName)) {
            inScript = true;
        }
    }

    /**
     * Removes and returns the first character of the internal buffer.
     * The buffer must not be empty.
     *
     * @return the character that was at the front of the buffer
     */
    protected int feedFromInternalBuffer() {
        //deleteCharAt works on single chars; reverse() treats surrogate
        //pairs as a unit and would tear them apart here
        int retChar = myBuffer.charAt(0);
        myBuffer.deleteCharAt(0);
        return retChar;
    }

    /**
     * Returns the next character of the cleaned document, or -1 at the
     * end of the input.  The first call starts the XML prolog; after
     * that, buffered output is drained before more input is consumed.
     *
     * @return the next output character, or -1 at end of stream
     * @throws IOException if the underlying reader fails
     */
    public int read() throws IOException {
        if (!docStarted) {
            return startDoc();
        }
        if (myBuffer.length() > 0) {
            return feedFromInternalBuffer();
        }
        int c = nextChar();
        if (c == '-') {
            return commentTailRemove();
        }
        if (c == '<') {
            return readMarkup();
        }
        return c;
    }

    /**
     * Handles everything that starts with '&lt;'.  The generated text
     * goes into the internal buffer, except for its first character,
     * which is returned.
     */
    private int readMarkup() throws IOException {
        generating = true;
        int second = nextChar();
        if (second == EOF) {
            //input ends with a lone '<'
            generating = false;
            return '<';
        }
        if ((second == '/') && inScript) {
            //end script with CDATA section; the first ']' is returned
            //by readTag once the end tag is complete
            inScript = false;
            endScript = true;
            myBuffer.append("]><");
        }
        if (second == '!') {
            return readDeclaration();
        }
        return readTag(second);
    }

    /**
     * Handles markup that starts with "&lt;!": Word conditionals, comment
     * starts and other declarations.
     */
    private int readDeclaration() throws IOException {
        int third = nextChar();
        if (third == '[') {
            //could be either if or endif
            return biConditional();
        }
        if (third == '-') {
            nextChar(); //second '-' of "<!--"
            int fifth = nextChar();
            if (fifth == '[') {
                //an if conditional
                return ifConditional();
            }
            //just a comment start: drop the mark, keep the content
            unread(fifth);
            generating = false;
            return ' ';
        }
        if (inScript) {
            //"<!" inside a script is script text, e.g. "a<!b"
            unread(third);
            myBuffer.append('!');
            generating = false;
            return '<';
        }
        //any other declaration, e.g. the document's own DOCTYPE: the
        //cleaner supplies a DOCTYPE itself, so discard up to the first '>'
        int skipped = third;
        while ((skipped != '>') && (skipped != EOF)) {
            skipped = nextChar();
        }
        generating = false;
        return ' ';
    }

    /**
     * Assembles a start or end tag in the internal buffer, quoting
     * unquoted attribute values, closing empty elements and opening a
     * CDATA section after a script start tag.  Inside a script only
     * the character after the '&lt;' is buffered, so that script text
     * is passed through untouched.
     *
     * @param second the character that followed the '&lt;'
     * @return ']' when a script was just closed, otherwise '&lt;'
     */
    private int readTag(int second) throws IOException {
        if (inScript) {
            //go by less than signs; note that this makes nesting
            //elements in script elements dangerous
            generating = false;
        } else {
            inElementName = true;
        }
        emptyElement = false;
        myBuffer.append((char) second);

        int c = '<';
        while ((c != '>') && generating) {
            c = nextChar();
            if (c == EOF) {
                break; //truncated tag: emit what there is
            }
            boolean delimiter =
                (c == '>') || (c == ' ') || (c == '\r') || (c == '\n');
            //check for end of element name
            if (delimiter && inElementName) {
                inElementName = false;
                checkElementType(myBuffer.toString());
            }
            //check for end attributes
            if (delimiter && inAttribute) {
                if (!bufferMentionsCharset()) {
                    myBuffer.append('"');
                }
                inAttribute = false;
            }
            if ((c == '>') && emptyElement && !bufferEndsWith('/')) {
                myBuffer.append('/');
            }
            myBuffer.append((char) c);
            //insert CDATA start here
            if ((c == '>') && inScript) {
                myBuffer.append("<![CDATA[");
            }
            //check for start attributes
            if (c == '=') {
                int valueStart = nextChar();
                if (valueStart == EOF) {
                    break;
                }
                if ((valueStart != '"') && (valueStart != '\'')) {
                    inAttribute = true;
                    if (!bufferMentionsCharset()) {
                        myBuffer.append('"');
                    }
                }
                myBuffer.append((char) valueStart);
            }
        }
        generating = false;
        inElementName = false;
        inAttribute = false;
        if (endScript) {
            endScript = false;
            return ']';
        }
        return '<';
    }

    /**
     * Word writes "charset=..." inside an already quoted content
     * attribute; no quotes may be added around that '='.
     */
    private boolean bufferMentionsCharset() {
        return myBuffer.toString().indexOf("charset") != -1;
    }

    /** Tells whether the last buffered character is the given one. */
    private boolean bufferEndsWith(char ch) {
        int length = myBuffer.length();
        return (length > 0) && (myBuffer.charAt(length - 1) == ch);
    }

    /**
     * This method is a placeholder - all 'real' activity appears in 
     * the int read() method.  
     * This placeholder is substantially from Java I/O by 
     * Elliotte Rusty Harold, http://www.oreilly.com/catalog/javaio/.
     *
     * @param text   destination array
     * @param offset index of the first element to fill
     * @param length maximum number of characters to read
     * @return the number of characters stored, or -1 if the end of the
     *         stream was reached before any character could be stored
     * @throws IOException if the underlying reader fails
     */
    public int read(char[] text, int offset, int length) throws IOException {
  
        if (endOfStream) return -1;
        if (length == 0) return 0;

        int numRead = 0;
        while (numRead < length) {
            int temp = this.read();
            if (temp == -1) {
              this.endOfStream = true;
              break;
            }
            text[offset + numRead] = (char) temp;
            numRead++;
        }
        //zero characters here can only mean end of stream
        return (numRead == 0) ? -1 : numRead;
  
    }

    /** This class defines a main() method to test the O2KCleaner */
    public static void main(String[] args) {
        
        if (args.length != 1) {
            System.out.println("Usage: java O2KCleaner <filename>");
            return;       
        }
        try {          
            // Create a stream to read and clean the file
            BufferedReader in = new BufferedReader(new O2KCleaner(new FileReader(args[0])));
            try {
                String line;
                while((line = in.readLine()) != null) {
                  System.out.println(line);
                }
            }
            finally {
                in.close();  // Close the stream, even if reading failed.
            }
        }
        catch(IOException e) {
            e.printStackTrace();
        }
    }
    
}
