Filter:   InfoImg
download NekoHtmlParser.java
Language: Java
License: AL20
Copyright: Copyright 2004 Outerthought bvba and Schaubroeck nv
LOC: 67
Project Info
Daisy
Server: CocoonDev
Type: svn
...g\outerj\daisy\htmlcleaner\
   ElementDescriptor.java
   ...oCorruptTagCleaner.java
   HtmlCleaner.java
   HtmlCleanerFactory.java
   HtmlCleanerTemplate.java
   HtmlRepairer.java
   ...acterEventsHandler.java
   NekoHtmlParser.java
   ...tElementDescriptor.java
   StylingHtmlSerializer.java
   xhtml-lat1.ent
   xhtml-special.ent
   xhtml-symbol.ent
   xhtml1-strict.dtd
   ...lDescriptorBuilder.java

/*
 * Copyright 2004 Outerthought bvba and Schaubroeck nv
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.outerj.daisy.htmlcleaner;

import org.xml.sax.*;
import org.xml.sax.helpers.AttributesImpl;
import org.cyberneko.html.parsers.SAXParser;
import org.outerj.daisy.xmlutil.SaxBuffer;

import java.io.IOException;
import java.io.StringReader;

/**
 * Parses HTML files using the Neko HTML parser. Puts all elements and attribute
 * names to lowercase, removes all namespaces, produces well-formed XML.
 */
class NekoHtmlParser {
    public SaxBuffer parse(String html) throws IOException, SAXException {
        if (html == null)
            throw new NullPointerException("html string argument is required.");

        InputSource is = new InputSource();
        is.setCharacterStream(new StringReader(html));

        SAXParser parser = new SAXParser();
        parser.setFeature("http://xml.org/sax/features/namespaces", true);
        parser.setFeature("http://cyberneko.org/html/features/override-namespaces", false);
        parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", false);
        parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8");
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

        SaxBuffer buffer = new SaxBuffer();
        parser.setContentHandler(new RemoveNamespacesHandler(new MergeCharacterEventsHandler(buffer)));
        parser.parse(is);

        return buffer;
    }

    /**
     * A ContentHandler that drops all namespace information.
     */
    static class RemoveNamespacesHandler implements ContentHandler {
        private ContentHandler consumer;

        public RemoveNamespacesHandler(ContentHandler consumer) {
            this.consumer = consumer;
        }

        public void endDocument() throws SAXException {
            consumer.endDocument();
        }

        public void startDocument() throws SAXException {
            consumer.startDocument();
        }

        public void characters(char ch[], int start, int length) throws SAXException {
            consumer.characters(ch, start, length);
        }

        public void ignorableWhitespace(char ch[], int start, int length) throws SAXException {
            consumer.ignorableWhitespace(ch, start, length);
        }

        public void endPrefixMapping(String prefix) throws SAXException {
            // dropped on purpose
        }

        public void skippedEntity(String name) throws SAXException {
            // dropped on purpose
        }

        public void setDocumentLocator(Locator locator) {
            consumer.setDocumentLocator(locator);
        }

        public void processingInstruction(String target, String data) throws SAXException {
            // dropped on purpose
        }

        public void startPrefixMapping(String prefix, String uri) throws SAXException {
            // dropped on purpose
        }

        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
            consumer.endElement("", localName, localName);
        }

        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
            AttributesImpl newAtts = new AttributesImpl(atts);
            for (int i = 0; i < atts.getLength(); i++) {
                newAtts.setURI(i, "");
                newAtts.setQName(i, newAtts.getLocalName(i));
            }
            consumer.startElement("", localName, localName, atts);
        }
    }

}