<< Ubuntu安装memcached的“C compiler cannot create executables”问题 | 首页 | Oracle启动时ORA-00600: 内部错误代码,参数: [kcratr1_lostwrt] >>

使用nekohtml完全控制并修整html

除了TidyHtmlCleaner,还可以使用nekohtml完全解析、控制并修整html标签内容。

如下代码:

/*
* Created on Dec 28, 2004
*
* TODO To change the template for this generated file go to
* Window - Preferences - Java - Code Style - Code Templates
*/
package edu.columbia.ais.portal.utils;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.DefaultFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.InputSource;

/**
* @author dgrimwood
*
* TODO To change the template for this generated type comment go to Window -
* Preferences - Java - Code Style - Code Templates
*/
public class XHTMLParser {
public static void parseHTMLFragment(DocumentFragment frag, String html)
throws Exception {
try {
DOMFragmentParser parser = new DOMFragmentParser();
parser
.setFeature(
"http://cyberneko.org/html/features/balance-tags/document-fragment",
true);
parser.setProperty(
"http://cyberneko.org/html/properties/default-encoding",
"UTF-8");
parser.setFeature("http://cyberneko.org/html/features/report-errors",true);
parser
.setProperty(
"http://cyberneko.org/html/properties/names/elems",
"match");
parser.setProperty(
"http://cyberneko.org/html/properties/names/attrs",
"no-change");
ElementRemover remover = new ElementRemover();
remover.acceptElement("a", new String[] { "href", "target", "name",
"rel", "shape", "coords", "title" });
remover.acceptElement("area", new String[] { "alt", "coords",
"href", "nohref", "shape", "target" });
remover.acceptElement("b", null);
remover.acceptElement("big", null);
remover.acceptElement("blockquote", new String[] { "cite" });
remover.acceptElement("br", null);
remover.acceptElement("caption", null);
remover.acceptElement("center", null);
remover.acceptElement("code", null);
remover.acceptElement("div", new String[] { "align" });
remover.acceptElement("em", null);
remover.acceptElement("form", new String[] { "action", "enctype",
"method", "type", "target" });
remover.acceptElement("h1", null);
remover.acceptElement("h2", null);
remover.acceptElement("h3", null);
remover.acceptElement("h4", null);
remover.acceptElement("h5", null);
remover.acceptElement("h6", null);
remover.acceptElement("hr", null);
remover.acceptElement("i", null);
remover.acceptElement("img", new String[] { "src", "width",
"height", "border", "vspace", "hspace", "alt", "title",
"align", "ismap", "usemap" });
remover.acceptElement("input", new String[] { "alt", "align",
"checked", "disabled", "maxlength", "name", "readonly",
"src", "size", "type", "value" });
remover.acceptElement("li", new String[] { "type", "value" });
remover.acceptElement("map", new String[] { "id", "name" });
remover.acceptElement("ol", new String[] { "compact", "start",
"type" });
remover.acceptElement("option", new String[] { "disabled", "label",
"selected", "value" });
remover.acceptElement("p", new String[] { "align" });
remover.acceptElement("pre", new String[] { "width" });
remover.acceptElement("s", null);
remover.acceptElement("select", new String[] { "disabled",
"multiple", "name", "size" });
remover.acceptElement("small", null);
remover.acceptElement("strike", null);
remover.acceptElement("strong", null);
remover.acceptElement("sub", null);
remover.acceptElement("sup", null);
remover.acceptElement("table", new String[] { "align", "border",
"cellpadding", "cellspacing", "summary", "width" });
remover.acceptElement("td",
new String[] { "abbr", "align", "colspan", "height",
"nowrap", "rowspan", "valign", "width" });
remover.acceptElement("textarea", new String[] { "cols", "rows",
"disabled", "readonly", "name" });
remover.acceptElement("th",
new String[] { "abbr", "align", "colspan", "height",
"nowrap", "rowspan", "valign", "width" });
remover.acceptElement("tr", new String[] { "align", "valign" });
remover.acceptElement("tt", null);
remover.acceptElement("u", null);
remover.acceptElement("ul", new String[] { "compact", "type" });
remover.acceptElement("var", null);

remover.removeElement("head");
remover.removeElement("style");
remover.removeElement("meta");
remover.removeElement("link");
remover.removeElement("title");
remover.removeElement("script");
remover.removeElement("noscript");

HTMLCleaner cleaner = new HTMLCleaner();
cleaner.acceptEmptyElement("area");
cleaner.acceptEmptyElement("br");
cleaner.acceptEmptyElement("img");
cleaner.acceptEmptyElement("input");
cleaner.acceptEmptyElement("hr");
cleaner.acceptEmptyElement("li");
cleaner.acceptEmptyElement("option");
cleaner.acceptEmptyElement("p");
cleaner.acceptEmptyElement("select");
cleaner.acceptEmptyElement("td");
cleaner.acceptEmptyElement("textarea");
cleaner.acceptEmptyElement("th");
cleaner.acceptEmptyElement("tr");
cleaner.acceptEmptyElement("td");
cleaner.acceptEmptyElement("textarea");
cleaner.translateTag("i", "em");
cleaner.translateTag("b", "strong");

XMLDocumentFilter[] filters = new XMLDocumentFilter[] { remover,
cleaner };

parser.setProperty("http://cyberneko.org/html/properties/filters",
filters);
parser.parse(new InputSource(new StringReader(html)), frag);
} catch (Exception e) {
throw e;
}
}

protected static class HTMLCleaner extends DefaultFilter implements
XMLDocumentFilter {
ArrayList acceptEmpty = new ArrayList();

HashMap translateTag = new HashMap();

boolean stripComments = true;

boolean stripPI = true;

boolean ignoreWhiteSpace = false;

public void acceptEmptyElement(String tagname) {
acceptEmpty.add(tagname.toLowerCase());
}

public void translateTag(String fromtag, String totag) {
translateTag.put(fromtag.toLowerCase(), totag);
}

void translateQName(QName element) {
String newTag = (String) translateTag.get(element.localpart
.toLowerCase());
if (newTag != null) {
//System.out.println("Replacing start "+element.localpart+"
// with "+newTag);
element.setValues(null, newTag, newTag, null);
}
}

boolean emptyAccepted(QName element) {
return acceptEmpty.contains(element.localpart.toLowerCase());
}

public void emptyElement(QName element, XMLAttributes attributes,
Augmentations augs) throws XNIException {
translateQName(element);
if (emptyAccepted(element)) {
super.emptyElement(element, attributes, augs);
//System.out.println("passing empty element:
// "+element.localpart);
}
}

public void characters(XMLString text, Augmentations augs) {
if (!ignoreWhiteSpace || text.toString().trim().length() > 0) {
super.characters(text, augs);
}
}

public void ignorableWhitespace(XMLString text, Augmentations augs) {
if (!ignoreWhiteSpace) {
super.ignorableWhitespace(text, augs);
}
}

public void startElement(QName element, XMLAttributes attributes,
Augmentations augs) throws XNIException {
translateQName(element);
super.startElement(element, attributes, augs);
}

public void endElement(QName element, Augmentations augs)
throws XNIException {
translateQName(element);
super.endElement(element, augs);
}

public void comment(XMLString text, Augmentations augs) {
if (!stripComments) {
super.comment(text, augs);
}
}

public void processingInstruction(java.lang.String target,
XMLString data, Augmentations augs) {
if (!stripPI) {
super.processingInstruction(target, data, augs);
}
}

}

}
标签 : ,



发表评论 发送引用通报