/*
* Copyright 2000 by Simon St.Laurent. All Rights Reserved.
*
* This program is open source software; you may use, copy, modify, and
* redistribute it under the terms of the LICENSE with which it was
* originally distributed.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* LICENSE for more details.
*/
//package org.simonstl.xml;
import java.io.*;
/**
*
* This class strips out the comment marks and the
* {@code <!--[if...]><![endif]-->} constructs from
* Microsoft Word 2000 HTML. It also adds quotation marks to
* attributes when necessary and wraps scripts in CDATA sections.
* A simple DOCTYPE declaration includes the entities needed.
*
* Many thanks to Elliotte Rusty Harold for his substantial readability
* cleanup and his finding (and fixing) an encoding-related problem.
*
* @author Simon St.Laurent
* @version 0.03 $Date: 2000/08/03 $
*/
public final class O2KCleaner extends FilterReader
{
/**
 * Creates a cleaner that reads from a byte stream.
 * The stream is wrapped in an InputStreamReader built with its
 * single-argument constructor, so bytes are decoded with the platform
 * default charset; the encoding property of this class is not consulted here.
 *
 * @param in the byte stream holding the Word 2000 HTML
 */
public O2KCleaner (InputStream in) {
this(new InputStreamReader(in));
}
/**
 * Creates a cleaner that reads from a character stream.
 * The reader is wrapped in a BufferedReader before being handed to
 * FilterReader, so the character-at-a-time reads done by this class
 * go through a buffer.
 *
 * @param in the character stream holding the Word 2000 HTML
 */
public O2KCleaner (Reader in) {
super(new BufferedReader(in));
}
//set to true by startDoc(); read() calls startDoc() for as long as this is false
protected boolean docStarted = false;
//set to true by read() when a less-than sign is seen; the conditional
//handlers and the in-script path set it back to false
protected boolean generating = false;
//cleared by read() when whitespace or a greater-than sign ends a token;
//NOTE(review): the place where it is set to true is not in the visible code
protected boolean inAttribute = false;
//set to true by read() at the start of an ordinary element and cleared at
//the first whitespace or greater-than sign after it
protected boolean inElementName = false;
//set by checkElementType() for meta, link and br; read() inserts a slash
//before the closing greater-than sign while it is true
protected boolean emptyElement = false;
//set by checkElementType() for a script element; cleared by read() when a
//less-than sign followed by a slash is seen while it is true
protected boolean inScript = false;
//set to true by read() at the same point where inScript is cleared;
//NOTE(review): no reader of this flag is in the visible code
protected boolean endScript = false;
//encoding name that startDoc() writes into the XML declaration
protected String encoding = "ISO-8859-1";
//pending output characters; feedFromInternalBuffer() hands them out one at a time
protected StringBuffer myBuffer = new StringBuffer();
/**
 * Reports the encoding name that is written into the generated XML
 * declaration. The value is ISO-8859-1 unless it has been replaced
 * through setEncoding.
 *
 * @return the encoding name currently configured
 */
public String getEncoding() {
  return this.encoding;
}
/**
 * Replaces the encoding name that is written into the generated XML
 * declaration. Until this is called the name is ISO-8859-1.
 *
 * @param newEncoding the encoding name to declare
 */
public void setEncoding(String newEncoding) {
  this.encoding = newEncoding;
}
/**
 * Queues the XML prolog in the internal buffer and marks the document as
 * started.
 *
 * The prolog is the XML declaration (carrying the configured encoding)
 * followed by a DOCTYPE whose internal subset declares and references the
 * three XHTML 1.0 character entity sets, so that named entities found in
 * the Word HTML can be resolved.
 *
 * NOTE(review): the DOCTYPE opening and the three ENTITY declarations were
 * missing from this copy of the file (only the parameter entity references
 * and the closing bracket were left, which is not well-formed XML). They
 * have been restored using the public identifiers and URLs published with
 * XHTML 1.0; confirm the root element name and system identifiers against
 * the original distribution.
 *
 * @return the less-than sign that opens the XML declaration; the rest of
 *         the prolog is left in the internal buffer
 */
protected int startDoc(){
  //the leading less-than sign is the return value, so the buffered text
  //deliberately starts at the question mark
  myBuffer.append("?xml version=\"1.0\" encoding=\"");
  myBuffer.append(encoding);
  myBuffer.append("\"?>\r\n");
  myBuffer.append("<!DOCTYPE html [\r\n");
  myBuffer.append(
    "<!ENTITY % lat1 PUBLIC \"-//W3C//ENTITIES Latin 1 for XHTML//EN\" "
    + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent\">\r\n");
  myBuffer.append("%lat1;\r\n");
  myBuffer.append(
    "<!ENTITY % special PUBLIC \"-//W3C//ENTITIES Special for XHTML//EN\" "
    + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent\">\r\n");
  myBuffer.append("%special;\r\n");
  myBuffer.append(
    "<!ENTITY % symbols PUBLIC \"-//W3C//ENTITIES Symbols for XHTML//EN\" "
    + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent\">\r\n");
  myBuffer.append("%symbols;\r\n");
  myBuffer.append("]>\r\n");
  docStarted=true;
  return '<';
}
/**
 * Looks ahead after a dash has been read, so that the tail of a comment
 * (two dashes followed by a greater-than sign) can be dropped.
 *
 * Every look-ahead character is stored in the internal buffer so it is not
 * lost when the sequence turns out not to be a comment tail. When the full
 * tail is recognised the buffer is emptied and a space is returned in its
 * place.
 *
 * End of input (-1) is never stored: casting it to char would queue a
 * bogus U+FFFF character for output.
 *
 * @return a space when a comment tail was consumed, otherwise the dash
 * @throws IOException if the underlying reader fails
 */
protected int commentTailRemove() throws IOException {
  int d=in.read();
  if (d==-1) {
    //input ended right after the dash; nothing to look ahead at
    return '-';
  }
  myBuffer.append((char) d);
  if (d=='-'){
    int e=in.read();
    if (e==-1) {
      //input ended after two dashes; the second dash stays buffered
      return '-';
    }
    myBuffer.append((char) e);
    if (e=='>'){
      //complete comment tail: discard the look-ahead characters
      myBuffer.setLength(0);
      return ' ';
    }
  }
  return '-';
}
/**
 * Handles a construct that read() passes on after seeing a less-than sign,
 * an exclamation mark and an opening square bracket, which may be either
 * an if or an endif.
 *
 * One more character is read to decide. For the letter e the input is
 * consumed up to and including the next greater-than sign, the internal
 * buffer is emptied and a single space is put in it. For the letter i the
 * following characters are copied into the internal buffer, the last two
 * buffered characters are dropped, and an xmlns:msc attribute plus a
 * closing greater-than sign are appended. Both branches clear the
 * generating flag. Any other character leaves the buffer and flags alone.
 *
 * NOTE(review): the first line of the i branch is damaged in this copy of
 * the file. The start of a string literal and the header of the loop that
 * follows it have been lost, so the method does not compile as it stands;
 * restore that line from the original distribution. The one-space literal
 * in the e branch may have lost markup in the same way.
 *
 * NOTE(review): the loop in the e branch only stops at a greater-than
 * sign, so it does not terminate if the input ends first.
 *
 * @return a space in every visible path
 * @throws IOException if the underlying reader fails
 */
protected int biConditional () throws IOException {
int retChar=' ';
//need to see if it's an if or endif. not consistent in MS
int eori=in.read();
if (eori=='e') {
while (retChar!='>') {
retChar=in.read();
}
myBuffer.setLength(0);
myBuffer.append(" ");
//msc is Microsoft Conditional
retChar=' ';
generating=false;
}
if (eori=='i') {
myBuffer.append("') {
retChar=in.read();
myBuffer.append((char)retChar);
}
myBuffer.setLength(myBuffer.length()-2);
myBuffer.append(
"' xmlns:msc='http://simonstl.com/projects/o2k/'>");
//msc is Microsoft Conditional
retChar=' ';
generating=false;
}
return retChar;
}
/**
 * Handles an if conditional that read() found inside a comment start.
 *
 * Characters are copied from the input into the internal buffer, the last
 * two buffered characters are then dropped, and an xmlns:msc attribute
 * plus a closing greater-than sign are appended. The generating flag is
 * cleared before returning.
 *
 * NOTE(review): the second statement of this method is damaged in this
 * copy of the file. The start of a string literal and the header of the
 * loop that follows it have been lost, so the method does not compile as
 * it stands; restore that line from the original distribution.
 *
 * @return a space
 * @throws IOException if the underlying reader fails
 */
protected int ifConditional() throws IOException{
int retChar=' ';
myBuffer.append("') {
retChar=in.read();
myBuffer.append((char)retChar);
}
myBuffer.setLength(myBuffer.length()-2);
myBuffer.append("' xmlns:msc='http://simonstl.com/projects/o2k/'>");
retChar=' ';
generating=false;
return retChar;
}
/**
 * Records what kind of element has just been named.
 *
 * The emptyElement flag is raised for meta, link and br, and the inScript
 * flag is raised for script. Neither flag is ever lowered here.
 *
 * Both tests use the name passed in. The script test previously re-read
 * the internal buffer instead, which gave the same answer only because the
 * visible caller passes the buffer contents as the argument.
 *
 * @param elemName the element name to classify; compared case-sensitively
 */
protected void checkElementType(String elemName) {
  if (elemName.equals("meta")
      || elemName.equals("link")
      || elemName.equals("br")) {
    emptyElement=true;
  }
  if (elemName.equals("script")) {
    inScript=true;
  }
}
/**
 * Removes the first character from the internal buffer and returns it.
 *
 * The character is removed with deleteCharAt rather than by reversing the
 * buffer, truncating it and reversing it back. StringBuffer.reverse keeps
 * surrogate pairs in their original order, so when the buffer began with a
 * surrogate pair the old sequence dropped the low surrogate and left the
 * high surrogate behind, corrupting the output.
 *
 * The buffer must not be empty: charAt(0) throws an
 * IndexOutOfBoundsException on an empty buffer.
 *
 * @return the character that was at the front of the buffer
 */
protected int feedFromInternalBuffer() {
  int retChar=myBuffer.charAt(0);
  myBuffer.deleteCharAt(0);
  return retChar;
}
public int read() throws IOException {
int c = ' '; //return a space if nothing else
if (docStarted==false) {
c=startDoc();
} else {
if ((generating==false) && (myBuffer.length()==0)) {
c = in.read();
if (c=='-') {
c=commentTailRemove();
}
if (c=='<') {
generating=true;
//2nd char handling
int d = in.read();
if ((d=='/') && (inScript==true)) {
//end script with CDATA section
inScript=false;
endScript=true;
myBuffer.append("]><");
}
if (d=='!') { //exclamation
int e=in.read();
switch ( e ){
case ('['):
//could be either if or endif
c=biConditional();
break;
case ('-'):
int f=in.read();
int g=in.read();
switch ( g ){
case ('['):
//an if conditional
c=ifConditional();
break;
default:
//just a comment start
c=' ';
generating=false;
break;
}
}
} else {
//after 2nd char, normal element, or within script
if (inScript==true) {
//go by less than signs
generating=false;
//internal buffer will get read out to reader, rather building
//note that this would make nesting elements in script elements
//dangerous
}
else {
inElementName=true;
}
emptyElement=false;
myBuffer.append((char)d);
while ((c!='>') && (generating==true)) {
c=in.read();
//check for end of element name
if (((c=='>') ||(c==' ') || (c=='\r') || (c=='\n'))&& (inElementName==true)) {
inElementName=false;
checkElementType(myBuffer.toString());
}
//check for end attributes
if ((c=='>') || (c==' ') || (c=='\r') || (c=='\n')) {
if ((inAttribute==true)){
String testString=myBuffer.toString();
if (testString.indexOf("charset")==-1) {
myBuffer.append('"');
}
inAttribute=false;
}
}
if ((c=='>') && (emptyElement==true)) {
myBuffer.append('/');
}
myBuffer.append((char) c);
//insert CDATA start here
if ((c=='>') && (inScript==true)) {
myBuffer.append("");
return;
}
try {
// Create a stream to read and clean the file
BufferedReader in = new BufferedReader(new O2KCleaner(new FileReader(args[0])));
String line;
while((line = in.readLine()) != null) {
System.out.println(line);
}
in.close(); // Close the stream.
}
catch(Exception e) {
e.printStackTrace();
}
}
}