// $Id: DOCTYPEChangerStream.java,v 1.1 2000/10/03 17:13:15 nigelw Exp $
/*
 * Copyright (c) 1999-2000 by Simon St.Laurent.  All Rights Reserved.
 *
 * This program is open source software; you may use, copy, modify, and
 * redistribute it under the terms of the LICENSE with which it was
 * originally distributed.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * LICENSE for more details.
 */

//package com.simonstl.xml;

import java.io.*;

/**
 * <p>This class adds a DOCTYPE to an incoming XML document or replaces its
 * existing DOCTYPE declaration if it has one.</p>
 *
 * <p>The stream is examined one byte at a time. Until the document is
 * considered "started", every <code>&lt;</code> is inspected: processing
 * instructions and comments are passed through, an existing DOCTYPE is either
 * replaced or kept (see {@link #setReplace(boolean)}), and the first other tag
 * is treated as the root element and gets the generated DOCTYPE inserted in
 * front of it.</p>
 *
 * <p>The generated declaration is emitted one <code>char</code> value per
 * <code>read()</code> call, so the root element name and identifiers should be
 * restricted to characters that are a single byte in the document's encoding;
 * {@link #read(byte[], int, int)} narrows each value to a <code>byte</code>.</p>
 *
 * @author Simon St.Laurent, Nigel Whitaker
 * @version 0.01 $Date: 2000/10/03 17:13:15 $
 */
public final class DOCTYPEChangerStream extends FilterInputStream {

    public DOCTYPEChangerStream(InputStream in) {
        super(in);
    }

    protected boolean replace = true;
    protected String rootElement;
    protected String publicIdentifier = "";
    protected String systemIdentifier = "";
    protected String internalSubsetContent = "";

    /** True once a DOCTYPE has been emitted or an existing one has been passed through. */
    protected boolean docStarted = false;
    protected boolean generating = false;
    /** Loop flag used while skipping an existing DOCTYPE declaration. */
    protected boolean cycle = true;
    /** True while the DOCTYPE skipper is between '[' and ']' of an internal subset. */
    protected boolean internalSubset = false;
    /** Characters that have been produced or looked ahead and are waiting to be returned by read(). */
    protected StringBuffer myBuffer = new StringBuffer();

    /** True while bytes of a prolog comment are being passed through untouched. */
    private boolean inComment = false;
    /** Number of consecutive '-' characters just seen inside the current comment. */
    private int commentDashes = 0;
    /** Set once read(byte[], int, int) has seen the end of the stream. */
    private boolean endOfStream = false;

    /**
     * Use setReplace to indicate whether to replace the DOCTYPE declarations
     * for documents that already have one. False means don't replace, true
     * means do replace.
     */
    public void setReplace(boolean replaceChoice) {
        replace = replaceChoice;
    }

    /**
     * Use setRootElement to set the root element identified by the DOCTYPE
     * declaration.
     */
    public void setRootElement(String elementName) {
        rootElement = elementName;
    }

    /**
     * Use setPublicIdentifier to set the public identifier identified by the
     * DOCTYPE declaration. If set, the result will be
     * {@code <!DOCTYPE rootElement PUBLIC 'publicIdentifier' 'systemIdentifier' [internalSubset, if present]>}.
     * You must also set a system identifier for this to work properly.
     */
    public void setPublicIdentifier(String identifier) {
        publicIdentifier = identifier;
    }

    /**
     * Use setSystemIdentifier to set the system identifier identified by the
     * DOCTYPE declaration. If set without a public identifier, the result
     * will be
     * {@code <!DOCTYPE rootElement SYSTEM 'systemIdentifier' [internalSubset, if present]>}.
     */
    public void setSystemIdentifier(String identifier) {
        systemIdentifier = identifier;
    }

    /**
     * Use this method to set the internal subset identified by the DOCTYPE
     * declaration. If set, the result will be
     * {@code <!DOCTYPE rootElement PUBLIC or SYSTEM identifiers [internalSubset]>}.
     */
    public void setInternalSubset(String subsetContents) {
        internalSubsetContent = subsetContents;
    }

    /** Returns the root element. Mostly useful to see if you set it previously. */
    public String getRootElement() {
        return rootElement;
    }

    /** Returns the public identifier. Mostly useful to see if you set it previously. */
    public String getPublicIdentifier() {
        return publicIdentifier;
    }

    /** Returns the system identifier. Mostly useful to see if you set it previously. */
    public String getSystemIdentifier() {
        return systemIdentifier;
    }

    /** Returns the internal subset. */
    public String getInternalSubset() {
        return internalSubsetContent;
    }

    /**
     * Appends the generated DOCTYPE declaration to the internal buffer and
     * marks the document as started.
     *
     * The leading '&lt;' is deliberately NOT appended: callers have already
     * read a '&lt;' from the underlying stream and return it themselves.
     * If no root element has been set, the text "null" is emitted in its
     * place (StringBuffer's handling of a null String).
     */
    protected void addDocType() {
        myBuffer.append("!DOCTYPE ");
        myBuffer.append(rootElement);
        if (publicIdentifier.equals("")) {
            if (!(systemIdentifier.equals(""))) {
                myBuffer.append(" SYSTEM '");
                myBuffer.append(systemIdentifier);
                myBuffer.append("'");
            }
        } else {
            myBuffer.append(" PUBLIC '");
            myBuffer.append(publicIdentifier);
            myBuffer.append("' '");
            myBuffer.append(systemIdentifier);
            myBuffer.append("'");
        }
        myBuffer.append(" ");
        if (!(internalSubsetContent.equals(""))) {
            myBuffer.append(" [\n");
            myBuffer.append(internalSubsetContent);
            myBuffer.append("\n]");
        }
        myBuffer.append(">\n");
        docStarted = true;
    }

    /**
     * Removes and returns the first character of the internal buffer.
     * Must only be called while the buffer is non-empty.
     *
     * @return the character value that was at the front of the buffer
     */
    protected int feedFromInternalBuffer() {
        int retChar = myBuffer.charAt(0);
        // Drop the head directly instead of reversing the buffer twice.
        myBuffer.deleteCharAt(0);
        return retChar;
    }

    /**
     * Reads the next byte of the (possibly rewritten) document.
     *
     * Pending buffered characters are returned first. Otherwise one byte is
     * read from the underlying stream; if it is a '&lt;' seen before the
     * document has started, the following byte(s) are inspected to decide
     * whether to pass markup through, replace an existing DOCTYPE, or insert
     * a new DOCTYPE in front of the root element.
     *
     * @return the next byte, or -1 at the end of the stream
     * @throws IOException if the underlying stream fails
     */
    public int read() throws IOException {
        if (myBuffer.length() > 0) {
            return feedFromInternalBuffer();
        }

        int c = in.read();

        if (inComment) {
            // Inside a prolog comment a '<' is just text, never the root element.
            trackCommentEnd(c);
            return c;
        }
        if (c != '<' || docStarted) {
            return c;
        }

        // We hold a '<' and still have to figure out what it introduces.
        int d = in.read();
        switch (d) {
            case -1:
                // Stream ended right after '<': nothing to look ahead at.
                break;
            case '?':
                // Processing instruction or XML declaration, let it go.
                myBuffer.append((char) d);
                break;
            case '!':
                // Either a comment or a DOCTYPE.
                handleMarkupDeclaration(d);
                break;
            default:
                // Root element, need to insert the DOCTYPE in front of it.
                addDocType();
                myBuffer.append("<");
                myBuffer.append((char) d);
                docStarted = true;
                break;
        }
        return c;
    }

    /**
     * Handles the bytes following "&lt;!" in the prolog: replaces or keeps a
     * DOCTYPE, or starts passing a comment through.
     *
     * @param bang the '!' byte already read, to be re-emitted when kept
     */
    private void handleMarkupDeclaration(int bang) throws IOException {
        int e = in.read();
        if (e == -1) {
            // Do not buffer the end-of-stream marker as if it were a character.
            myBuffer.append((char) bang);
            return;
        }

        if (e == 'D') {
            // DOCTYPE! Bingo.
            if (replace) {
                skipExistingDocType();
            } else {
                myBuffer.append((char) bang);
                myBuffer.append((char) e);
            }
            docStarted = true;
            return;
        }

        myBuffer.append((char) bang);
        myBuffer.append((char) e);
        if (e == '-') {
            // "<!-" so far; a second '-' completes the comment opener.
            int f = in.read();
            if (f != -1) {
                myBuffer.append((char) f);
                if (f == '-') {
                    inComment = true;
                    commentDashes = 0;
                }
            }
        }
    }

    /**
     * Consumes the remainder of an existing DOCTYPE declaration and queues
     * the replacement.
     *
     * INCLUDE/IGNORE are prohibited from the internal subset, so the end is
     * found by looking for ']' followed by '&gt;', or a '&gt;' outside
     * '[' ... ']'. If the stream ends first, the replacement is still queued
     * so that the caller terminates instead of looping forever.
     */
    private void skipExistingDocType() throws IOException {
        while (cycle) {
            int e = in.read();
            if (e == -1) {
                addDocType();
                cycle = false;
            } else if (e == '[') {
                internalSubset = true;
            } else if (e == '>' && !internalSubset) {
                // End of DOCTYPE.
                addDocType();
                cycle = false;
            } else if (e == ']') {
                internalSubset = false;
            }
        }
    }

    /**
     * Updates comment-tracking state for one byte read while inside a
     * comment; the comment ends at a '&gt;' preceded by at least two '-',
     * or at the end of the stream.
     */
    private void trackCommentEnd(int c) {
        if (c == '-') {
            commentDashes++;
            return;
        }
        if (c == -1 || (c == '>' && commentDashes >= 2)) {
            inComment = false;
        }
        commentDashes = 0;
    }

    /**
     * Reads up to <code>length</code> bytes into <code>text</code> by calling
     * {@link #read()} repeatedly - all 'real' activity appears in the int
     * read() method. Substantially from Java I/O by Elliotte Rusty Harold,
     * http://www.oreilly.com/catalog/javaio/.
     *
     * @return the number of bytes stored, 0 if <code>length</code> is 0, or
     *         -1 if the end of the stream was reached before any byte was read
     * @throws NullPointerException if <code>text</code> is null
     * @throws IndexOutOfBoundsException if offset/length do not fit the array
     * @throws IOException if the underlying stream fails
     */
    public int read(byte[] text, int offset, int length) throws IOException {
        if (text == null) {
            throw new NullPointerException("text");
        }
        if (offset < 0 || length < 0 || length > text.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (length == 0) {
            return 0;
        }
        if (endOfStream) {
            return -1;
        }

        int numRead = 0;
        while (numRead < length) {
            int temp = this.read();
            if (temp == -1) {
                this.endOfStream = true;
                break;
            }
            text[offset + numRead] = (byte) temp;
            numRead++;
        }
        // A non-empty request must never report 0 bytes; signal end of stream instead.
        return (numRead == 0) ? -1 : numRead;
    }

    /**
     * Skips bytes of the rewritten document by reading and discarding them,
     * so that buffered output and DOCTYPE handling are not bypassed.
     *
     * @return the number of bytes actually skipped
     */
    public long skip(long n) throws IOException {
        long skipped = 0;
        while (skipped < n && read() != -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Always false: resetting the underlying stream would not rewind this
     * filter's own state (pending buffer, docStarted).
     */
    public boolean markSupported() {
        return false;
    }

    /** Does nothing; see {@link #markSupported()}. */
    public void mark(int readlimit) {
        // Intentionally empty: mark/reset is not supported by this filter.
    }

    /**
     * Always fails; see {@link #markSupported()}.
     *
     * @throws IOException always
     */
    public void reset() throws IOException {
        throw new IOException("mark/reset not supported");
    }

    /**
     * This class defines a main() method to test the DOCTYPEChanger.
     * Expects exactly one argument, the file to read; the filtered document
     * is printed to standard output.
     */
    public static void main(String[] args) {
        try {
            if (args.length != 1) {
                throw new IllegalArgumentException("Wrong number of arguments");
            }

            // Create a stream to read and clean the file
            DOCTYPEChangerStream tester =
                new DOCTYPEChangerStream(new FileInputStream(args[0]));
            try {
                tester.setRootElement("html");
                tester.setSystemIdentifier("http://www.simonstl.com/html");
                tester.setPublicIdentifier("-//SIMONSTLCOM//DTD tester//EN");
                tester.setInternalSubset("this is a test");
                tester.setReplace(false);
                BufferedReader in = new BufferedReader(new InputStreamReader(tester));
                String line;
                while ((line = in.readLine()) != null) {
                    System.out.println(line);
                }
            } finally {
                // Close the stream even when reading fails.
                tester.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("Usage: java DOCTYPEChangerStream <filename>");
        }
    }

    /*
     Comments from Nigel Whitaker regarding using a Stream rather than a Reader:

     My main comment is that I think extending a FilterInputStream would be a
     better choice than a FilterReader. Let me try and explain my (and
     xml-dev's) reasoning/research:

     I am trying to use DOCTYPEChanger as a direct filter on the input to SAX
     (well SAX2 as in Xerces' XMLReader). My original code created an
     org.xml.sax.InputSource and fed this into the parse method. I then tried
     to add your Filter and started getting validation exceptions, even though
     I ensured your code was correctly adding a DOCTYPE.

     The problem arises from the use of a Reader in this chain:

       java.io.File -> java.io.FileInputStream -> java.io.InputStreamReader ->
       DOCTYPEChanger -> org.xml.sax.InputSource -> parser.parse()

     My original working/validating code (with the DOCTYPE added by hand) was:

       java.io.File -> java.io.FileInputStream -> org.xml.sax.InputSource ->
       parser.parse()

     Although the use of Readers and Writers is the preferred way of doing
     Java IO since Java 1.1, it causes some problems with SAX parsing. It
     seems that SAX prefers to handle InputStreams of raw bytes because
     otherwise (with java.lang.Character based Readers) it cannot properly
     handle the encoding conversion issues of the input file.

     I did a bit of research and found that this issue was previously covered
     in a July 1999 xml-dev discussion entitled "encoding problem fixed". It
     appears that depending on the platform-specific JVM Readers may default
     to different encodings. See this thread:

       http://lists.xml.org/archives/xml-dev/199907/msg00413.html

     David Megginson also issued an "Important SAX Guideline" in this message:

       http://lists.xml.org/archives/xml-dev/199907/msg00414.html

       Always use an InputStream in preference to a Reader when you don't
       know the XML document's character encoding in advance.

     Users who use DOCTYPEChanger to write a temporary or intermediate file
     may not encounter this problem, only those who apply the Filter as part
     of the parsing, as I was doing.

     Following the guideline seems to have solved my problems. The conversion
     to a FilterInputStream was a fairly simple change to your code.
    */
}