在上一篇文章里说到了SAX的快速入门,现在让我们来看看它的一个具体应用吧。
现在有这样的一个XML document:
- <?xml version="1.0" encoding="UTF-8"?>
- <store>
- <other_tag1 />
- <type name="book" type_id="1" />
- <other_tag3 />
- <bookstore>
- <other_tag2 />
- <address addr="Shanghai,China" />
- <other_tag4 />
- <book category="COOKING" title="Everyday Italian" author="Giada De Laurentiis" year="2005" price="30.00" />
- <book category="CHILDREN" title="Harry Potter" author="J K. Rowling" year="2005" price="29.99" />
- </bookstore>
- </store>
我们要利用SAX提取<type name="book" type_id="1" />, <address addr="Shanghai,China" />, <book/>这几个nodes的信息,我们应该怎么做呢?
现在有这样的一个思路:利用SAX,每当遇到localName为"type"、"address"、"book"的node时,就停下来抓取信息。这时候,我觉得一个方法就是在
startElement里面加上if/ else if/ else 这样的判断。这样虽然直接明了,但是很傻:如果要解析的xml内容少还好,如果要抓取的信息量极大的话,那得
要多少个if/ else if/ else ?!现在我们换一个新的方法来处理这件事情:把需要追踪的元素形成一棵“类似XML的元素树”,例如,我们要追踪的全部元素为"store"、
"type"、"bookstore"、"address"、"book",树的结构为:
-              store
-                |
-        -----------------
-        |               |
-      type          bookstore
-                        |
-                 ---------------
-                 |             |
-              address         book
追踪的代码:
形成“XML元素树”的关键代码,这是TagTracker.java里的一部分:
- // Each call maps a slash-separated element path to the tracker object that handles that element.
- root.track("store",new StoreTracker());
- root.track("store/type",new TypeTracker());
- root.track("store/bookstore",new BookStoreTracker());
- root.track("store/bookstore/address",new AddrTracker());
- root.track("store/bookstore/book",new BookTracker());
- public void track(String tagName,TagTracker tracker) {
- int slashOffset = tagName.indexOf("/");
- if(slashOffset < 0) {
- trackers.put(tagName,tracker);
- } else if(slashOffset == 0) {
- // "/a/b" --> "a/b" and continue.
- track(tagName.substring(1),tracker);
- } else {
- String topTagName = tagName.substring(0,slashOffset);
- String remainderOfTagName = tagName.substring(slashOffset + 1);
- TagTracker child = trackers.get(topTagName);
- if(child == null) {
- child = new TagTracker();
- trackers.put(topTagName,child);
- }
- child.track(remainderOfTagName,tracker);
- }
- }
这样,在整个"root" element下有"store"节点,在“store”下有“type”和“bookstore”节点,在"bookstore"下又有“address”和"book"节点,形成了我们实际需要追踪的
“元素树”.
我们知道,SAX是通过startElement(String namespaceURI,String localName,String qName,Attributes attr)和 endElement(String namespaceURI,String localName,String qName)来获取元素的uri,localName,attributes等信息的(如果要获取元素里的文本信息,需要调用characters(char[] ch,int start,int length))。每一个元素都应该有一个特定的方法来收集这个元素的信息(包括元素里的文本信息):
而SAX的startElement和endElement方法会分别调用上述的两个方法,把namespaceURI,qName,attributes等信息传递过去
- // get information of the element "type"
- private class TypeTracker extends TagTracker {
- public TypeTracker() {
- }
- @Override
- public void onStart(String namespaceURI,Attributes attr) throws Exception {
- String name = attr.getValue("name");
- String typeId = attr.getValue("type_id");
- // handle these info. ...
- }
- @Override
- public void onEnd(String namespaceURI,CharArrayWriter contents) {
- // get the characters data inside the element
- String text = contents.toString();
- // handle this text...
- }
- }
好了,现在让我们来看看,是怎么让SAX挑出我们所需要追踪的元素并解析的吧(自动忽略其他的元素)。
TagTracker.java的全部代码:
在startElement中,我们会去“元素树”中选择当前的元素,从而去判断当前的元素是否应该被解析:
- package com.desmond.xml.sax;
- import java.io.CharArrayWriter;
- import java.util.Hashtable;
- import java.util.Stack;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.xml.sax.Attributes;
- public class TagTracker {
- private static Log log = LogFactory.getLog(TagTracker.class);
- private Hashtable<String,TagTracker> trackers = new Hashtable<String,TagTracker>();
- // use to skip these un-choiced elements
- private static SkippingTagTracker skip = new SkippingTagTracker();
- public TagTracker() {
- }
- /**
- * track all elements need to be tracked.
- * @param tagName the absolute path of the tracked element
- * @param tracker the detail handler to parse a special element
- */
- public void track(String tagName,TagTracker tracker) {
- int slashOffset = tagName.indexOf("/");
- if (slashOffset < 0) {
- // if it is a simple tag name (no "/" sperators) simple add it.
- trackers.put(tagName,tracker);
- } else if (slashOffset == 0) {
- // "/a/b" --> "a/b" and continue.
- track(tagName.substring(1),slashOffset);
- String remainderOfTagName = tagName.substring(slashOffset + 1);
- TagTracker child = trackers.get(topTagName);
- if (child == null) {
- child = new TagTracker();
- trackers.put(topTagName,child);
- }
- child.track(remainderOfTagName,tracker);
- }
- }
- /**
- * start to parse a element,which will be invoked by SAX's startElement.
- * @param namespaceURI
- * @param localName
- * @param qName
- * @param attr
- * @param tagStack "tracked element tree"
- * @throws Exception
- */
- public void startElement(String namespaceURI,Attributes attr,Stack<TagTracker> tagStack)
- throws Exception {
- TagTracker tracker = trackers.get(localName);
- // not found this tag track.
- if (tracker == null) {
- log.debug("Skipping tag:[" + localName + "]");
- tagStack.push(skip);
- } else {
- log.debug("Tracking tag:[" + localName + "]");
- onDeactivate();
- tracker.onStart(namespaceURI,attr);
- tagStack.push(tracker);
- }
- }
- /**
- * end to parse a element,which will be invoked by SAX's endElement.
- * @param namespaceURI
- * @param localName
- * @param qName
- * @param contents
- * @param tagStack current element
- * @throws Exception
- */
- public void endElement(String namespaceURI,CharArrayWriter contents,Stack tagStack) throws Exception {
- log.debug("Finished tracking tag:[" + localName + "]");
- try {
- onEnd(namespaceURI,contents);
- } catch (Exception e) {
- e.printStackTrace();
- throw e;
- }
- // clean up the stack
- tagStack.pop();
- // send the reactivate event
- TagTracker activeTracker = (TagTracker) tagStack.peek();
- if (activeTracker != null) {
- log.debug("Reactivating pervIoUs tag tracker.");
- activeTracker.onReactivate();
- }
- }
- /**
- * detail method to start to parse the special element.
- * @param namespaceURI
- * @param localName
- * @param qName
- * @param attr
- * @throws Exception
- */
- public void onStart(String namespaceURI,Attributes attr) throws Exception {
- }
- public void onDeactivate() throws Exception {
- }
- /**
- * detail method to end to parse the special element.
- * @param namespaceURI
- * @param localName
- * @param qName
- * @param contents
- */
- public void onEnd(String namespaceURI,CharArrayWriter contents) {
- }
- public void onReactivate() throws Exception {
- }
- }
- // Look up the tracker registered under this element name among the children.
- TagTracker tracker = trackers.get(localName);
- // No tracker is registered for this tag.
- if (tracker == null) {
- log.debug("Skipping tag:[" + localName + "]");
- // Push the shared skipping tracker in place of a real one.
- tagStack.push(skip);
- } else {
- log.debug("Tracking tag:[" + localName + "]");
- // Notify the current tracker, let the matched tracker read the element, then push the matched tracker.
- onDeactivate();
- tracker.onStart(namespaceURI,attr);
- tagStack.push(tracker);
- }
如果tracker为null,说明这个元素不是我们想要解析的那些,因此要"跳过",如何去跳过,这里用到了另一个类SkippingTagTracker,它所做的事情就是去跳过这个
元素,代码如下:
如果tracker有值,我们就要开始解析这个元素了。这时,我们调用前面提到的“一个特定的方法来收集这个元素的信息”,即:
- package com.desmond.xml.sax;
- import java.util.Stack;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.xml.sax.Attributes;
- public class SkippingTagTracker extends TagTracker {
- private static Log log = LogFactory.getLog(SkippingTagTracker.class);
- public void startElement(String namespaceURI,Stack tagStack) {
- log.debug("Skipping tag[" + localName + "]...");
- tagStack.push(this);
- }
- public void endElement(String namespaceURI,Stack tagStack) {
- log.debug("Finished skipping tag:[" + localName + "]");
- tagStack.pop();
- }
- }
完了,之后我们把当前这个元素的tracker压入栈中,如果这个元素没有子元素,那么它将会在endElement中被抛出栈顶。如果有的话,先处理它的子
- tracker.onStart(namespaceURI,attr);
元素,等所有的子元素都处理完了,才调用endElement结束这个元素(这个也是SAX处理元素的规则,这里只是用到了这一点而已)。
综上所述,整个事件的处理流程是:使用TagTracker追踪所需要的元素-------> 利用TagTracker的 track方法递归调用形成“元素树”-------> 利用这些
“元素树”去判断当前的元素是不是应该被解析-------> 不被解析就跳过,被解析,再去判断它的子元素。一直这样递归地完成整个解析过程。
附全部代码(SaxMapper.java/ SkippingTagTracker.java/ TagTracker.java/ TestMain.java,共四个类).
SaxMapper.java
- package com.desmond.xml.sax;
- import java.io.ByteArrayInputStream;
- import java.io.CharArrayWriter;
- import java.io.File;
- import java.io.IOException;
- import java.util.Stack;
- import org.apache.commons.configuration.Configuration;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.xml.sax.Attributes;
- import org.xml.sax.InputSource;
- import org.xml.sax.SAXException;
- import org.xml.sax.XMLReader;
- import org.xml.sax.helpers.DefaultHandler;
- import org.xml.sax.helpers.XMLReaderFactory;
- public class SaxMapper extends DefaultHandler{
- private static final Log log = LogFactory.getLog(SaxMapper.class);
- private String file = "";
- protected Stack<TagTracker> tagStack = new Stack<TagTracker>();
- protected XMLReader xr;
- protected CharArrayWriter contents = new CharArrayWriter();
- protected boolean parSEOnly;
- protected Configuration config;
- public SaxMapper() throws Exception {
- try {
- xr = XMLReaderFactory.createXMLReader();
- } catch (SAXException e) {
- e.printStackTrace();
- }
- log.info("Creating the tag tracker network.");
- tagStack.push(createTagTrackerNetwork());
- log.info("Tag Tracker network created.");
- }
- @Override
- public void startElement(String namespaceURI,Attributes attr) throws SAXException {
- contents.reset();
- TagTracker ativeTracker = (TagTracker) tagStack.peek();
- try {
- ativeTracker.startElement(namespaceURI,attr,tagStack);
- } catch(Exception e) {
- e.printStackTrace();
- throw new SAXException(e);
- }
- }
- @Override
- public void endElement(String namespaceURI,String qName)
- throws SAXException {
- TagTracker activeTracker = (TagTracker) tagStack.peek();
- try {
- activeTracker.endElement(namespaceURI,contents,tagStack);
- } catch(Exception e) {
- e.printStackTrace();
- throw new SAXException(e);
- }
- }
- @Override
- public void characters(char[] ch,int length)
- throws SAXException {
- contents.write(ch,start,length);
- }
- protected InputSource getSource(String fileName) throws IOException {
- File xmlFile = new File(fileName);
- byte[] xmlBytes = FileUtils.readFileToByteArray(xmlFile);
- return new InputSource(new ByteArrayInputStream(xmlBytes));
- }
- protected void parseXML() throws IOException,Exception {
- parse(getSource(getFileName()));
- }
- protected void parse(InputSource in) throws Exception{
- parSEOnly = true;
- xr.setContentHandler(this);
- log.info("start to parse...");
- xr.parse(in);
- log.info("end to parse...");
- }
- protected TagTracker createTagTrackerNetwork() {
- TagTracker root = new TagTracker();
- root.track("store",new BookTracker());
- return root;
- }
- protected String getFileName() {
- return file;
- }
- protected void setFileName(String fileName) {
- this.file = fileName;
- }
- private class StoreTracker extends TagTracker {
- public StoreTracker() {
- }
- @Override
- public void onStart(String namespaceURI,Attributes attr) throws Exception {
- }
- @Override
- public void onEnd(String namespaceURI,CharArrayWriter contents) {
- }
- }
- // get information of the element "type"
- private class TypeTracker extends TagTracker {
- public TypeTracker() {
- }
- @Override
- public void onStart(String namespaceURI,CharArrayWriter contents) {
- // get the characters data inside the element
- String text = contents.toString();
- // handle this text...
- }
- }
- private class BookStoreTracker extends TagTracker {
- public BookStoreTracker() {
- }
- }
- private class AddrTracker extends TagTracker {
- public AddrTracker() {
- }
- @Override
- public void onStart(String namespaceURI,CharArrayWriter contents) {
- }
- }
- private class BookTracker extends TagTracker {
- public BookTracker() {
- }
- @Override
- public void onStart(String namespaceURI,CharArrayWriter contents) {
- }
- }
- }
SkippingTagTracker.java
- package com.desmond.xml.sax;
- import java.util.Stack;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.xml.sax.Attributes;
- public class SkippingTagTracker extends TagTracker {
- private static Log log = LogFactory.getLog(SkippingTagTracker.class);
- public void startElement(String namespaceURI,Stack tagStack) {
- log.debug("Finished skipping tag:[" + localName + "]");
- tagStack.pop();
- }
- }
TagTracker
- package com.desmond.xml.sax;
- import java.io.CharArrayWriter;
- import java.util.Hashtable;
- import java.util.Stack;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.xml.sax.Attributes;
- public class TagTracker {
- private static Log log = LogFactory.getLog(TagTracker.class);
- private Hashtable<String,CharArrayWriter contents) {
- }
- public void onReactivate() throws Exception {
- }
- }
TestMain.java
- package com.desmond.xml.sax;
- public class TestMain {
- /**
- * @param args
- * @throws Exception
- */
- public static void main(String[] args) throws Exception {
- SaxMapper mapper = new SaxMapper();
- if(args.length > 0) {
- mapper.setFileName(args[0]);
- mapper.parseXML();
- } else {
- System.out.println("no file configurated! please configurate it.");
- }
- }
- }