HtmlConverter

package org.codehaus.staxmate.samples;

import java.io.*;

import javax.xml.stream.*;

import org.codehaus.staxmate.SMIteratorFactory;
import org.codehaus.staxmate.sr.SMIterator;

/**
 * Simple demonstration of using StaxMate on top of StAX, to simplify
 * nested XML parsing: implements a converter from well-formed HTML
 * to a Wiki-like textual output format.
 *<p>
 * General rules for output Wiki-like markup are:
 * <ul>
 *  <li>Blocks (~= paragraphs) are separated by one or more empty lines
 *    (two or more consecutive linefeeds)
 *   </li>
 *  <li>There are 4 inline markups; bolding, italics, underline and
 *    hyperlink; these are marked by (respectively), '''text''',
 *    ''text'', ___text___, [[url | text]].
 *   </li>
 *  <li>Lists are marked by lines that start with '*' (unordered) or '#'
 *    chars (ordered), followed by one or more spaces and list contents;
 *    nested lists are marked by indentation of 2 spaces per nesting level.
 *    Only inline markup is allowed inside list items, in addition to
 *    sub-lists.
 *   </li>
 *  <li>Non-nested tables are marked by pipe ('|') character starting a
 *    line; each text row represents a table row, and cells are separated
 *    by pipe chars as well. Cell or row spans are not supported, nor
 *    nested tables; inline markup is allowed inside cells
 *   </li>
 * </ul>
 *
 * @author Tatu Saloranta
 */
    public final class HTMLConverter
    {
    /** Private: the only visible instantiation is the one done by {@code main}. */
    private HTMLConverter() {
        // nothing to initialize
    }

private void convert(String filename)
throws IOException, XMLStreamException
{
XMLInputFactory f = XMLInputFactory.newInstance();
// Let's configure factory 'optimally'...
f.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
f.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
// just so it won't try to load DTD in if there's DOCTYPE
f.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
f.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
InputStream in = new java.io.FileInputStream(filename);
XMLStreamReader sr = f.createXMLStreamReader(in);

SMIterator it = SMIteratorFactory.rootElementIterator(sr);
it.setElementTracking(SMIterator.TRACK_ELEM_VISIBLE_SIBLINGS);

Writer out = new PrintWriter(System.out);

try

Unknown macro: { processHTML(it, out); }
finally {
try
Unknown macro: { out.flush(); }
catch (Throwable t) { }
sr.close();
try
Unknown macro: { in.close(); }
catch (Throwable t) { }
}
}

private void processHTML(SMIterator it, Writer out)
throws IOException, XMLStreamException
{
it.getNext(); // has to be of type element now...

String origName = it.getCurrentLocalName();
String name = origName.toLowerCase();

/* It should be HTML... but let's also allow lone 'body'

  • as well, for additional robustness
    */
    if (name.equals("body"))
    Unknown macro: { processBody(it, out); }
    else if (!name.equals("html"))
    Unknown macro: { throw new XMLStreamException("Non-HTML document? Root element '" +origName+"'; excepted <HTML> or <html>"); }

SMIterator mainIt = it.childElementIterator();
int type;

while ((type = mainIt.getNext()) != SMIterator.SM_NODE_NONE) {
origName = mainIt.getCurrentLocalName();
name = origName.toLowerCase();

// Should be 'head' or 'body'
if (name.equals("head"))

Unknown macro: { processHead(mainIt, out); }
else if (name.equals("body"))
Unknown macro: { processBody(mainIt, out); }
else
Unknown macro: { throw new XMLStreamException("Non-HTML document? Unexpected element '" +origName+"'; under <HTML>."); }

}
}

/**

  • Simple handler for HEAD section of a html document. Only looks for
  • title element (for now); returns as soon as that's gotten.
    */
    private void processHead(SMIterator parentIt, Writer out)
    throws IOException, XMLStreamException
    {
    SMIterator headIt = parentIt.childElementIterator();
    int type;
    while ((type = headIt.getNext()) != SMIterator.SM_NODE_NONE)
    Unknown macro: { if (headIt.getCurrentLocalName().toLowerCase().equals("title"))
    Unknown macro: { // Could capitalize it too... out.write("== "); String str = SMIteratorFactory.collectDescendantText(headIt, true); // Let's remove linefeeds if there was any addSingleLine(out, str); out.write(" ==nn"); // Ok, that's it, we don't care about other stuff break; }
    }

    }

/**

  • Simple handler for BODY section of a html document.
  • Has special handling for some elements (paragraphs, lists,
  • tables, links).
    */
    private void processBody(SMIterator parentIt, Writer out)
    throws IOException, XMLStreamException
    {
    /* We need both elements and text content (but not comments etc);
  • further, due to loose nesting of HTML, let's just do flat
  • iteration in general, as we can still do sub-scoping for
  • specific elements (tables etc)
    */
    SMIterator bodyIt = parentIt.descendantMixedIterator();
    int type;
    StringBuffer text = null; // for collected 'loose' text

while ((type = bodyIt.getNext()) != SMIterator.SM_NODE_NONE) {
// Let's weed out end elements right away...
if (type == XMLStreamConstants.END_ELEMENT)

Unknown macro: { continue; }

// And straight text as well:
String inline;
if (type == XMLStreamConstants.START_ELEMENT) {
String tag = bodyIt.getCurrentLocalName().toLowerCase();
if (processBlockElement(bodyIt, out, tag, text))

Unknown macro: { // true -> was succesfully handled text = null; continue; }

/* Ok; not a block we recognized... but maybe a well-known

  • inline element?
    */
    inline = checkInlineMarkup(bodyIt, tag);
    } else
    Unknown macro: { inline = bodyIt.getCurrentText(); }

if (inline != null) {
if (text == null)

Unknown macro: { text = new StringBuffer(inline); }
else
Unknown macro: { text.append(inline); }

}
} // while (...)

if (text != null)

Unknown macro: { addPara(out, text); text = null; }

}

/**

  • Method that is used to figure out type and handling of a node,
  • at block level scope (but not from inside tables and lists)
    */
    private boolean processBlockElement(SMIterator it, Writer out, String tag,
    StringBuffer text)
    throws IOException, XMLStreamException
    {
    // We'll only get START_ELEMENT events here

if (tag.charAt(0) == 'h' && tag.length() == 2) {
char c = tag.charAt(1);
// heading?
if (c >= '1' && c <= '5') {
if (text != null)

Unknown macro: { addPara(out, text); }

processHeading(it, out, (c - '1'));
return true;
}
}

/* Handling of paragraphs depends on whether it's a main level

  • thing or not
    */
    if (tag.equals("p") || tag.equals("blockquote"))
    Unknown macro: { // (no special handling for blockquote currently) addPara(out, text); /* Let's recursively call the main loop, and then add an* empty line after it. */ processBody(it, out); out.write("nn"); return true; }

    if (tag.equals("pre"))
    Unknown macro: { addPara(out, text); // Can't have any markup in there... String str = SMIteratorFactory.collectDescendantText(it, true); if (str.length() > 0)
    Unknown macro: { addPara(out, str); }
    return true; }

    if (tag.equals("ul") || tag.equals("o"))
    Unknown macro: { addPara(out, text); processList(it, out, (tag.charAt(0) == 'u') ? '*' }

    if (tag.equals("table"))
    Unknown macro: { addPara(out, text); processTable(it, out, false); return true; }

// Not a recognized (or handlable) block element
return false;
}

private void processHeading(SMIterator it, Writer out, int depth)
throws IOException, XMLStreamException
{
depth += 2;
if (depth > 5)

Unknown macro: { depth = 5; }

String prefix = "=====".substring(0, depth);
out.write(prefix);
out.write(' ');
SMIteratorFactory.processDescendantText(it, out, true);
out.write(' ');
out.write(prefix);
out.write("\n\n");
}

private void processList(SMIterator it, Writer out, char type, int depth)
throws IOException, XMLStreamException
{
/* Let's assume child elements have to be 'li' elements or

  • sublists ('ul', 'ol'); and ignore everything else.
    */
    SMIterator listIt = it.childElementIterator();

// We'll only get START_ELEMENTs here except for EOF:
while (listIt.getNext() != SMIterator.SM_NODE_NONE) {
String tag = listIt.getCurrentLocalName().toLowerCase();
if (tag.equals("li"))

Unknown macro: { processListItem(listIt, out, type, depth); }
else if (tag.equals("ul"))
Unknown macro: { processList(listIt, out, '*', depth+1); }
else if (tag.equals("ol"))
Unknown macro: { processList(listIt, out, '#', depth+1); }
else
Unknown macro: { /* could add warnings, or append content to previous item,* or create a list heading... whatever */ }

}

// And finally, trailing empty line, but only for main-level lists
if (depth == 0)

Unknown macro: { out.write('n'); }

}

private void processListItem(SMIterator it, Writer out, char listType, int depth)
throws IOException, XMLStreamException
{
// Ok, list item marker:
for (int i = 0; i < depth; ++i)

Unknown macro: { out.write(" "); // 2 space indentation }

out.write(listType);
out.write(' ');

/* List item contents are more varied; text, inline markup; maybe

  • even sublists.
    */
    SMIterator itemIt = it.childMixedIterator();
    int type;

while ((type = itemIt.getNext()) != SMIterator.SM_NODE_NONE) {
if (type == XMLStreamConstants.START_ELEMENT) {
String tag = itemIt.getCurrentLocalName().toLowerCase();
// only care about sub-lists:
if (tag.equals("ul") || tag.equals("ol"))

Unknown macro: { out.write('n'); // to finish off the current line processList(itemIt, out, (tag.charAt(0) == 'u') ? '*' }
else { // can also process inline markup
String str = checkInlineMarkup(itemIt, tag);
if (str != null)
Unknown macro: { addSingleLine(out, str); continue; }

}
// Otherwise, let's just collect and output text:
addSingleLine(out, SMIteratorFactory.collectDescendantText(itemIt, true));
} else
Unknown macro: { addSingleLine(out, itemIt.getCurrentText()); }

}
out.write('\n');
}

private void processTable(SMIterator it, Writer out, boolean header)
throws IOException, XMLStreamException
{
/* Let's assume child elements have to be 'tr', or one of grouping

  • elements ('thead', 'tfoot' or 'tbody'), and ignore everything else.
    */
    SMIterator tableIt = it.childElementIterator();
    // We'll only get START_ELEMENTs here except for EOF:
    while (tableIt.getNext() != SMIterator.SM_NODE_NONE)
    Unknown macro: { String tag = tableIt.getCurrentLocalName().toLowerCase(); if (tag.equals("thead") || tag.equals("tfoot")|| tag.equals("tbody"))
    Unknown macro: {\ /* Let's just recursively call this method, should be ||* safe? */ processTable(tableIt, out, header || tag.equals("thead")); }
    else if (tag.equals("tr"))
    Unknown macro: { processTableRow(tableIt, out, header); }
    // and ignore others.... }

    // Let's add empty line as paragraph separator...
    out.write("\n");
    }

private void processTableRow(SMIterator it, Writer out, boolean headerRow)
throws IOException, XMLStreamException
{
// Let's assume only 'tr' elements are encountered...
SMIterator rowIt = it.childElementIterator();
out.write("|");
// We'll only get START_ELEMENTs here except for EOF:
while (rowIt.getNext() != SMIterator.SM_NODE_NONE) {
String tag = rowIt.getCurrentLocalName().toLowerCase();
if (tag.equals("td"))

Unknown macro: { processTableCell(rowIt, out, headerRow); }
else if (tag.equals("th"))
Unknown macro: { processTableCell(rowIt, out, true); }
else

}
out.write("|");
}
// Let's add lf, to separate rows...
out.write("\n");
}

private void processTableCell(SMIterator it, Writer out, boolean headerCell)
throws IOException, XMLStreamException
{
/* Cells can have varied content, though... generally we only care

  • about text and inline markup, though.
    */
    SMIterator cellIt = it.childMixedIterator();
    int type;
    while ((type = cellIt.getNext()) != SMIterator.SM_NODE_NONE) {
    if (type == XMLStreamConstants.START_ELEMENT)
    Unknown macro: { String tag = cellIt.getCurrentLocalName().toLowerCase(); // No sub-tables or lists allowed... just inline markup String str = checkInlineMarkup(cellIt, tag); if (str != null)
    Unknown macro: { addSingleLine(out, str); continue; }
    // Otherwise, let's just collect and output text}
    else
    Unknown macro: { // just plain text addSingleLine(out, cellIt.getCurrentText()); }

    }
    }

private String checkInlineMarkup(SMIterator it, String tag)
throws IOException, XMLStreamException
{
if (tag.equals("a"))

Unknown macro: { XMLStreamReader sr = it.getStreamReader(); String url = sr.getAttributeValue(null, "href"); String str = SMIteratorFactory.collectDescendantText(it, true); return "[["+url+" | "+str+"]]"; }

if (tag.equals("b"))
Unknown macro: { String str = SMIteratorFactory.collectDescendantText(it, true); return "'''"+str+"'''"; }


if (tag.equals("i"))


}
if (tag.equals("u"))
Unknown macro: { String str = SMIteratorFactory.collectDescendantText(it, true); return "___"__+str+__"___"; }

if (tag.equals("hr"))
Unknown macro: { return "n-----n"; }

if (tag.equals("br"))
Unknown macro: { // Hmmh. This won't work too well... return "n"; }

// Nope, inline markup not recognized (or no effect can be applied)
return null;
}

/**

  • Method called to output "unwrapped" text (either not contained in
  • any element, or in unrecognized one). Let's just output it as
  • is, but add paragraph separator after the text.
    */
    private void addPara(Writer out, StringBuffer textBuf)
    throws IOException
    Unknown macro: { addPara(out, textBuf.toString()); }

private void addPara(Writer out, String text)
throws IOException
{
/* Let's remove all linefeeds from the start, and from the end,

  • to make sure we won't have excessive empty lines...
    */
    int len = text.length();
    int i = 0;
    while (i < len)
    Unknown macro: { char c = text.charAt(i); if (c != 'r' && c != 'n')
    Unknown macro: { break; }

    ++i;
    }
    if (i > 0)

    Unknown macro: { text = text.substring(i); }

    i = len = text.length()-1;
    while (i >= 0) {
    char c = text.charAt;
    if (c != '\r' && c != '\n')

    }

    --i;
    }
    if (i < len)
    Unknown macro: { text = text.substring(0, i+1); }

// Also, let's see if there's any non-space stuff left?
if (text.trim().length() > 0)

Unknown macro: { out.write(text); out.write("nn"); }

}

/**

  • Simple (although not very efficient) method that'll replace linefeeds
  • with single space chars and output results
    */
    private void addSingleLine(Writer out, String text)
    throws IOException
    {
    // Need to replace linefeeds, that's all
    BufferedReader br = new BufferedReader(new StringReader(text));
    String line;
    boolean first = true;

while ((line = br.readLine()) != null) {
if (first)

Unknown macro: { first = false; }
else
Unknown macro: { out.write(' '); }

out.write(line);
}
}

public static void main(String[] args)
throws Exception
{
if (args.length != 1)

Unknown macro: { System.err.println("Usage}

new HTMLConverter().convert(args[0]);
}
}

Labels

 
(None)