[html5] Using validator.nu as a standalone library (Trubin, Stanislav)

Fri Apr 22 13:21:16 PDT 2011

Stan,

It is not a straightforward process for sure. Good luck!!

My first suggestion would be to use Groovy rather than Java when
experimenting with this stuff. It is much easier, and it is easy to
then translate to Java if you need to.

After my signature you will find the smallest amount of Groovy code I
needed to be able to get the nu.validator HTML 5 parser to work
standalone (i.e. outside of the project that can be built using the
build script you can download from the web site).

You will need to download the code for building the validator from:

http://about.validator.nu/#src

I also had to build a skeleton DocumentNavigator in order to avoid
pulling in lots and lots of complex code. Here is the code for that
skeleton class:

/*
*/
import org.jaxen.dom.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.jaxen.DefaultNavigator;
import org.jaxen.FunctionCallException;
import org.jaxen.Navigator;
import org.jaxen.XPath;
import org.jaxen.JaxenConstants;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.xml.sax.SAXException;

/**
 * Jaxen navigator for the W3C DOM object model that reports every element
 * and attribute as having no namespace.
 *
 * <p>Both {@code getElementNamespaceUri} and {@code getAttributeNamespaceUri}
 * are overridden to return {@code null}, so that XPaths generated without
 * any namespace will work.
 */
public class HtmlDocumentNavigator extends DocumentNavigator {

    /** Shared instance handed out by {@link #getInstance()}. */
    private static final DocumentNavigator SHARED_INSTANCE = new HtmlDocumentNavigator();

    /**
     * Creates a navigator. The class adds no state of its own, so there is
     * nothing to initialise here.
     */
    public HtmlDocumentNavigator() {
    }

    /**
     * Returns the shared navigator so callers do not have to create one.
     *
     * @return the constant {@code HtmlDocumentNavigator} instance
     */
    public static Navigator getInstance() {
        return SHARED_INSTANCE;
    }

    /**
     * Reports the namespace URI of an element, which is always absent here.
     *
     * @param element the target node (ignored)
     * @return always {@code null}
     */
    public String getElementNamespaceUri(Object element) {
        return null;
    }

    /**
     * Reports the namespace URI of an attribute, which is always absent here.
     *
     * @param attribute the target node (ignored)
     * @return always {@code null}
     */
    public String getAttributeNamespaceUri(Object attribute) {
        return null;
    }

}

// end of HtmlDocumentNavigator.java

--Dylan

import org.xml.sax.InputSource
import nu.validator.htmlparser.dom.HtmlDocumentBuilder
import nu.validator.xml.PrudentHttpEntityResolver
import nu.validator.xml.DataUriEntityResolver
import nu.validator.localentities.LocalCacheEntityResolver

// Fetches a page through the validator.nu entity-resolver chain and parses it
// into a DOM document with the nu.validator HTML5 parser.

laxType = true
// TODO: need to specify error handler to be able to validate
errorHandler = null

// Do not know why we need to call the next two lines twice - but we do
httpRes = new PrudentHttpEntityResolver(1024*2048, laxType, errorHandler)
httpRes.setParams(20000, 20000, 100)
// This duplication is INTENTIONAL
httpRes = new PrudentHttpEntityResolver(1024*2048, laxType, errorHandler)
httpRes.setParams(20000, 20000, 100)

// Each resolver wraps the previous one: HTTP -> data: URI -> local cache.
dataRes = new DataUriEntityResolver(httpRes, laxType, errorHandler)
entityResolver = new LocalCacheEntityResolver(dataRes)

url = "http://amazon.com/"
// First argument is null; presumably the SAX (publicId, systemId) pair - TODO confirm.
ins = entityResolver.resolveEntity(null, url)
hdb = new HtmlDocumentBuilder()
doc = hdb.parse(ins)