001/*
002Copyright (c) 2011+, HL7, Inc
003All rights reserved.
004
005Redistribution and use in source and binary forms, with or without modification, 
006are permitted provided that the following conditions are met:
007
008 * Redistributions of source code must retain the above copyright notice, this 
009   list of conditions and the following disclaimer.
010 * Redistributions in binary form must reproduce the above copyright notice, 
011   this list of conditions and the following disclaimer in the documentation 
012   and/or other materials provided with the distribution.
013 * Neither the name of HL7 nor the names of its contributors may be used to 
014   endorse or promote products derived from this software without specific 
015   prior written permission.
016
017THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
018ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
019WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
020IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
021INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 
022NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
023PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
024WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
025ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
026POSSIBILITY OF SUCH DAMAGE.
027
028*/
029package org.hl7.fhir.utilities.xhtml;
030
031import java.io.IOException;
032import java.io.InputStream;
033import java.io.InputStreamReader;
034import java.io.Reader;
035import java.io.StringReader;
036import java.util.ArrayList;
037import java.util.HashMap;
038import java.util.HashSet;
039import java.util.List;
040import java.util.Map;
041import java.util.Set;
042
043import org.hl7.fhir.exceptions.FHIRException;
044import org.hl7.fhir.exceptions.FHIRFormatError;
045import org.hl7.fhir.utilities.xhtml.XhtmlNode.Location;
046import org.w3c.dom.Attr;
047import org.w3c.dom.Element;
048import org.w3c.dom.Node;
049import org.xmlpull.v1.XmlPullParser;
050import org.xmlpull.v1.XmlPullParserException;
051
052public class XhtmlParser {
053  public static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
054
055  public class NSMap {
056    private Map<String, String> nslist = new HashMap<String, String>();
057
058    public NSMap(NSMap nsm) {
059      if (nsm != null)
060        nslist.putAll(nsm.nslist);
061    }
062
063    public void def(String ns) {
064      nslist.put("", ns);
065    }
066
067    public void ns(String abbrev, String ns) {
068      nslist.put(abbrev, ns);  
069    }
070
071    public String def() {
072      return nslist.get("");
073    }
074
075    public boolean hasDef() {
076      return nslist.containsKey("");
077    }
078
079    public String get(String abbrev) {
080      return nslist.containsKey(abbrev) ? nslist.get(abbrev) : "http://error/undefined-namespace";
081    }
082  }
083
084  public class QName {
085    private String ns;
086    private String name;
087
088    public QName(String src) {
089      if (src.contains(":")) {
090        ns = src.substring(0, src.indexOf(":"));
091        name = src.substring(src.indexOf(":")+1);
092      } else {
093        ns = null;
094        name = src;
095      }
096    }
097
098    public String getName() {
099      return name;
100    }
101
102    public boolean hasNs() {
103      return ns != null;
104    }
105
106    public String getNs() {
107      return ns;
108    }
109
110    @Override
111    public String toString() {
112      return ns+"::"+name;
113    }
114    
115  }
116
  // Element names recognised by elementIsOk() without consulting the security policy.
  private Set<String> elements = new HashSet<String>();
  // Attribute names recognised by attributeIsOk(): either a bare name (any element)
  // or "element.attribute" (that element only).
  private Set<String> attributes = new HashSet<String>();  
119  
120  
121  public XhtmlParser() {
122  super();
123  policy = ParserSecurityPolicy.Accept; // for general parsing
124  
125  // set up sets
126  elements.add("p");
127  elements.add("br");
128  elements.add("div");
129  elements.add("h1");
130  elements.add("h2");
131  elements.add("h3");
132  elements.add("h4");
133  elements.add("h5");
134  elements.add("h6");
135  elements.add("a");
136  elements.add("span");
137  elements.add("b");
138  elements.add("em");
139  elements.add("i");
140  elements.add("strong");
141  elements.add("small");
142  elements.add("big");
143  elements.add("tt");
144  elements.add("small");
145  elements.add("dfn");
146  elements.add("q");
147  elements.add("var");
148  elements.add("abbr");
149  elements.add("acronym");
150  elements.add("cite");
151  elements.add("blockquote");
152  elements.add("hr");
153  elements.add("address");
154  elements.add("bdo");
155  elements.add("kbd");
156  elements.add("q");
157  elements.add("sub");
158  elements.add("sup");
159  elements.add("ul");
160  elements.add("ol");
161  elements.add("li");
162  elements.add("dl");
163  elements.add("dt");
164  elements.add("dd");
165  elements.add("pre");
166  elements.add("table");
167  elements.add("caption");
168  elements.add("colgroup");
169  elements.add("col");
170  elements.add("thead");
171  elements.add("tr");
172  elements.add("tfoot");
173  elements.add("tbody");
174  elements.add("th");
175  elements.add("td");
176  elements.add("code");
177  elements.add("samp");
178  elements.add("img");
179  elements.add("map");
180  elements.add("area");
181  
182  attributes.add("title");
183  attributes.add("style");
184  attributes.add("class");
185  attributes.add("id");
186  attributes.add("lang");
187  attributes.add("xml:lang");
188  attributes.add("dir");
189  attributes.add("accesskey");
190  attributes.add("tabindex");
191    // tables:
192  attributes.add("span");
193  attributes.add("width");
194  attributes.add("align");
195  attributes.add("valign");
196  attributes.add("char");
197  attributes.add("charoff");
198  attributes.add("abbr");
199  attributes.add("axis");
200  attributes.add("headers");
201  attributes.add("scope");
202  attributes.add("rowspan");
203  attributes.add("colspan");
204
205  attributes.add("a.href");
206  attributes.add("a.name");
207  attributes.add("img.src");
208  attributes.add("img.border");
209  attributes.add("div.xmlns");
210  attributes.add("blockquote.cite");
211  attributes.add("q.cite");
212  attributes.add("a.charset");
213  attributes.add("a.type");
214  attributes.add("a.name");
215  attributes.add("a.href");
216  attributes.add("a.hreflang");
217  attributes.add("a.rel");
218  attributes.add("a.rev");
219  attributes.add("a.shape");
220  attributes.add("a.coords");
221  attributes.add("img.src");
222  attributes.add("img.alt");
223  attributes.add("img.longdesc");
224  attributes.add("img.height");
225  attributes.add("img.width");
226  attributes.add("img.usemap");
227  attributes.add("img.ismap");
228  attributes.add("map.name");
229  attributes.add("area.shape");
230  attributes.add("area.coords");
231  attributes.add("area.href");
232  attributes.add("area.nohref");
233  attributes.add("area.alt");
234  attributes.add("table.summary");
235  attributes.add("table.width");
236  attributes.add("table.border");
237  attributes.add("table.frame");
238  attributes.add("table.rules");
239  attributes.add("table.cellspacing");
240  attributes.add("table.cellpadding");
241}
242
/**
 * What the parser does with an element or attribute that is not in its recognised sets.
 */
public enum ParserSecurityPolicy {
  /** Keep the unrecognised item. */
  Accept,
  /** Leave the unrecognised item out of the result. */
  Drop,
  /** Fail the parse with a {@code FHIRFormatError}. */
  Reject
}
248
  // How elementIsOk()/attributeIsOk() treat names outside the recognised sets; set to Accept by the constructor.
  private ParserSecurityPolicy policy;
  
  // When true, text nodes are trimmed before being added and whitespace after tags/comments is skipped.
  private boolean trimWhitespace;
  // When true, mismatched end tags and premature end of input raise FHIRFormatError.
  private boolean mustBeWellFormed = true;
  // When true, every element/attribute is accepted and namespace declarations are tracked.
  private boolean validatorMode;
254  
255  public boolean isTrimWhitespace() {
256    return trimWhitespace;
257  }
258
259  public void setTrimWhitespace(boolean trimWhitespace) {
260    this.trimWhitespace = trimWhitespace;
261  }
262
263  public boolean isMustBeWellFormed() {
264    return mustBeWellFormed;
265  }
266
267  public XhtmlParser setMustBeWellFormed(boolean mustBeWellFormed) {
268    this.mustBeWellFormed = mustBeWellFormed;
269    return this;
270  }
271  
272
273  public boolean isValidatorMode() {
274    return validatorMode;
275  }
276
277  public XhtmlParser setValidatorMode(boolean validatorMode) {
278    this.validatorMode = validatorMode;
279    return this;
280  }
281
282  public ParserSecurityPolicy getPolicy() {
283   return policy;
284  }
285
286  public void setPolicy(ParserSecurityPolicy policy) {
287  this.policy = policy; 
288  }
289
290  public XhtmlNode parseHtmlNode(Element node) throws FHIRFormatError  {
291    return parseHtmlNode(node, null);
292  }
293  
294  public XhtmlNode parseHtmlNode(Element node, String defaultNS) throws FHIRFormatError  {
295    XhtmlNode res = parseNode(node, defaultNS);
296    if (res.getNsDecl() == null)
297      res.getAttributes().put("xmlns", XHTML_NS);
298    return res;
299  }
300
301  private XhtmlNode parseNode(Element node, String defaultNS) throws FHIRFormatError  {
302    XhtmlNode res = new XhtmlNode(NodeType.Element);
303    res.setName(node.getLocalName());
304    defaultNS = checkNS(res, node, defaultNS);
305    for (int i = 0; i < node.getAttributes().getLength(); i++) {
306      Attr attr = (Attr) node.getAttributes().item(i);
307      if (attributeIsOk(res.getName(), attr.getName(), attr.getValue()) && !attr.getLocalName().startsWith("xmlns"))
308        res.getAttributes().put(attr.getName(), attr.getValue());
309    }
310    Node child = node.getFirstChild();
311    while (child != null) {
312      if (child.getNodeType() == Node.TEXT_NODE) {
313        res.addText(child.getTextContent());
314      } else if (child.getNodeType() == Node.COMMENT_NODE) {
315        res.addComment(child.getTextContent());
316      } else if (child.getNodeType() == Node.ELEMENT_NODE) {
317        if (elementIsOk(child.getLocalName()))
318          res.getChildNodes().add(parseNode((Element) child, defaultNS));
319      } else
320        throw new FHIRFormatError("Unhandled XHTML feature: "+Integer.toString(child.getNodeType())+descLoc());
321      child = child.getNextSibling();
322    }
323    return res;
324  }  
325
326  private String checkNS(XhtmlNode res, Element node, String defaultNS) {
327    if (!validatorMode)
328      return null;
329    String ns = node.getNamespaceURI();
330    if (ns == null)
331      return null;
332    if (!ns.equals(defaultNS)) {
333      res.getAttributes().put("xmlns", ns);
334      return ns;
335    }
336    return defaultNS;
337  }
338
339  public XhtmlNode parseHtmlNode(XmlPullParser xpp) throws XmlPullParserException, IOException, FHIRFormatError  {
340    XhtmlNode res = parseNode(xpp);
341    if (res.getNsDecl() == null)
342      res.getAttributes().put("xmlns", XHTML_NS);
343    return res;
344
345  }
346  private XhtmlNode parseNode(XmlPullParser xpp) throws XmlPullParserException, IOException, FHIRFormatError  {
347    XhtmlNode res = new XhtmlNode(NodeType.Element);
348    res.setName(xpp.getName());
349    
350    for (int i = 0; i < xpp.getAttributeCount(); i++) {
351      if (attributeIsOk(xpp.getName(), xpp.getAttributeName(i), xpp.getAttributeValue(i)))
352      res.getAttributes().put(xpp.getAttributeName(i), xpp.getAttributeValue(i));
353    }
354    int eventType = xpp.next();
355    while (eventType != XmlPullParser.END_TAG) {
356      if (eventType == XmlPullParser.TEXT) {
357        res.addText(xpp.getText());
358        xpp.next();
359      } else if (eventType == XmlPullParser.COMMENT) {
360        res.addComment(xpp.getText());
361        xpp.next();
362      } else if (eventType == XmlPullParser.START_TAG) {
363        if (elementIsOk(xpp.getName()))
364          res.getChildNodes().add(parseNode(xpp));
365      } else
366        throw new FHIRFormatError("Unhandled XHTML feature: "+Integer.toString(eventType)+descLoc());
367      eventType = xpp.getEventType();
368    }
369    xpp.next();
370    return res;
371  }  
372
373  private boolean attributeIsOk(String elem, String attr, String value) throws FHIRFormatError  {
374    if (validatorMode)
375      return true;
376  boolean ok = attributes.contains(attr) || attributes.contains(elem+"."+attr);
377  if (ok)
378    return true;
379  else switch (policy) {
380    case Accept: return true;
381    case Drop: return false;
382    case Reject: throw new FHIRFormatError("Illegal HTML attribute "+elem+"."+attr);
383  }
384
385  if ((elem+"."+attr).equals("img.src") && !(value.startsWith("#") || value.startsWith("http:") || value.startsWith("https:"))) {
386    switch (policy) {
387      case Accept: return true;
388      case Drop: return false;
389      case Reject: throw new FHIRFormatError("Illegal Image Reference "+value);
390    }
391  }
392  return false;
393  }
394
395private boolean elementIsOk(String name) throws FHIRFormatError  {
396    if (validatorMode)
397      return true;
398    boolean ok = elements.contains(name);
399  if (ok)
400      return true;
401  else switch (policy) {
402    case Accept: return true;
403    case Drop: return false;
404    case Reject: throw new FHIRFormatError("Illegal HTML element "+name);
405  }
406  return false;
407}
408
409  private String descLoc() {
410    return " at line "+Integer.toString(line)+" column "+Integer.toString(col);
411  }
412
  // Character source for the text parser; assigned by the public parse(...) methods.
  private Reader rdr;
  // Look-ahead buffer: peekChar() stores the peeked character here and pushChar() prepends to it.
  private String cache = "";
  // Ancestor that a mismatched end tag closed (lenient mode); parseElementInner() returns until it is reached.
  private XhtmlNode unwindPoint;
  // Most recent non-empty text added by addTextNode(); quoted in error messages.
  private String lastText = "";
  // Current line and column, maintained by readChar() and reported by descLoc().
  private int line = 1;
  private int col = 0;
  // Used by readChar() so that a "\r\n" pair counts as a single line break.
  private char lastChar;
  // Position captured by the most recent markLocation() call.
  private Location lastLoc;
421  
422  public XhtmlDocument parse(String source, String entryName) throws FHIRFormatError, IOException  {
423    rdr = new StringReader(source);
424    return parse(entryName);
425  }
426  
427  public XhtmlDocument parse(InputStream input, String entryName) throws FHIRFormatError, IOException  {
428    rdr = new InputStreamReader(input, "UTF-8");
429    return parse(entryName);
430  }
431   
432  private XhtmlDocument parse(String entryName) throws FHIRFormatError, IOException 
433  {
434    XhtmlDocument result = new XhtmlDocument();
435    skipWhiteSpaceAndComments(result);
436    if (peekChar() != '<')
437      throw new FHIRFormatError("Unable to Parse HTML - does not start with tag. Found "+peekChar()+descLoc());
438    readChar();
439    markLocation();
440    QName n = new QName(readName().toLowerCase());
441    if ((entryName != null) && !n.getName().equals(entryName))
442      throw new FHIRFormatError("Unable to Parse HTML - starts with '"+n+"' not '"+entryName+"'"+descLoc());
443    XhtmlNode root = result.addTag(n.getName());
444    root.setLocation(markLocation());
445    parseAttributes(root);
446    markLocation();
447    NSMap nsm = checkNamespaces(n, root, null, true);
448    if (readChar() == '/') {
449      if (peekChar() != '>')
450        throw new FHIRFormatError("unexpected non-end of element "+n+" "+descLoc());
451      readChar();
452    } else {
453      unwindPoint = null;
454      List<XhtmlNode> p = new ArrayList<XhtmlNode>();
455      parseElementInner(root, p, nsm, true);
456    }
457    return result;
458  }
459  
460  private Location markLocation() {
461    Location res = lastLoc;
462    lastLoc = new Location(line, col);
463    return res;
464  }
465
466  private NSMap checkNamespaces(QName n, XhtmlNode node, NSMap nsm, boolean root) {
467    // what we do here is strip out any stated namespace attributes, putting them in the namesapce map
468    // then we figure out what the namespace of this element is, and state it explicitly if it's not the default
469    
470    // but we don't bother with any of this if we're not validating
471    if (!validatorMode)
472      return null;
473    NSMap result = new NSMap(nsm);
474    List<String> nsattrs = new ArrayList<String>();
475    for (String an : node.getAttributes().keySet()) {
476      if (an.equals("xmlns")) {
477        result.def(node.getAttribute(an));
478        nsattrs.add(an);
479      }
480      if (an.startsWith("xmlns:")) {
481        result.ns(an.substring(6), node.getAttribute(an));
482        nsattrs.add(an);
483      }
484    }
485    for (String s : nsattrs)
486      node.getAttributes().remove(s);
487    if (n.hasNs()) {
488      String nns = result.get(n.getNs());
489      if (!nns.equals(result.def())) {
490        node.getAttributes().put("xmlns", nns);
491        result.def(nns);
492      }
493    } else if (root && result.hasDef()) {
494      node.getAttributes().put("xmlns", result.def());
495    }
496    return result;
497  }
498
499  private void addTextNode(XhtmlNode node, StringBuilder s)
500  {
501    String t = isTrimWhitespace() ? s.toString().trim() : s.toString();
502    if (t.length() > 0)
503    {
504      lastText = t;
505      // System.out.println(t);
506      node.addText(t).setLocation(markLocation());
507      s.setLength(0);
508    }
509  }
510  private void parseElementInner(XhtmlNode node, List<XhtmlNode> parents, NSMap nsm, boolean escaping) throws FHIRFormatError, IOException 
511  {
512    StringBuilder s = new StringBuilder();
513    while (peekChar() != '\0' && !parents.contains(unwindPoint) && !(node == unwindPoint))
514    {
515      if (peekChar() == '<')
516      {
517        addTextNode(node, s);
518        readChar();
519        if (peekChar() == '!') {
520          String sc = readToCommentEnd();
521          if (sc.startsWith("DOCTYPE"))
522            throw new FHIRFormatError("Malformed XHTML: Found a DocType declaration, and these are not allowed (XXE security vulnerability protection)");
523          node.addComment(sc).setLocation(markLocation());
524        } else if (peekChar() == '?')
525          node.addComment(readToTagEnd()).setLocation(markLocation());
526        else if (peekChar() == '/') {
527          readChar();
528          QName n = new QName(readToTagEnd());
529          if (node.getName().equals(n.getName()))
530            return;
531          else
532          {
533            if (mustBeWellFormed)
534              throw new FHIRFormatError("Malformed XHTML: Found \"</"+n.getName()+">\" expecting \"</"+node.getName()+">\""+descLoc());
535            for (int i = parents.size() - 1; i >= 0; i--)
536            {
537              if (parents.get(i).getName().equals(n))
538                unwindPoint = parents.get(i);
539            }
540            if (unwindPoint != null)
541            {
542              for (int i = parents.size(); i > 0; i--)
543              {
544                if (i < parents.size() && parents.get(i) == unwindPoint)
545                  return;
546                if (i == parents.size())
547                {
548                  parents.get(i - 1).getChildNodes().addAll(node.getChildNodes());
549                  node.getChildNodes().clear();
550                }
551                else
552                {
553                  parents.get(i - 1).getChildNodes().addAll(parents.get(i).getChildNodes());
554                  parents.get(i).getChildNodes().clear();
555                }
556              }
557            }
558          }
559        }
560        else if (Character.isLetterOrDigit(peekChar()))
561        {
562          parseElement(node, parents, nsm);
563        }
564        else
565          throw new FHIRFormatError("Unable to Parse HTML - node '" + node.getName() + "' has unexpected content '"+peekChar()+"' (last text = '"+lastText+"'"+descLoc());
566      }
567      else if (peekChar() == '&') // escaping && 
568      {
569        parseLiteral(s);
570      }
571      else
572        s.append(readChar());
573    }
574    addTextNode(node, s);
575  }
576
577  private void parseElement(XhtmlNode parent, List<XhtmlNode> parents, NSMap nsm) throws IOException, FHIRFormatError 
578  {
579    markLocation();
580    QName name = new QName(readName());
581    XhtmlNode node = parent.addTag(name.getName());
582    node.setLocation(markLocation());
583    List<XhtmlNode> newParents = new ArrayList<XhtmlNode>();
584    newParents.addAll(parents);
585    newParents.add(parent);
586    parseAttributes(node);
587    markLocation();
588    nsm = checkNamespaces(name, node, nsm, false);
589    if (readChar() == '/') {
590      if (peekChar() != '>')
591        throw new FHIRFormatError("unexpected non-end of element "+name+" "+descLoc());
592      readChar();
593    } else {
594       parseElementInner(node, newParents, nsm, "script".equals(name.getName()));
595    }
596  }
597  
598  private void parseAttributes(XhtmlNode node) throws FHIRFormatError, IOException 
599  {
600    while (Character.isWhitespace(peekChar()))
601      readChar();
602    while (peekChar() != '>' && peekChar() != '/' && peekChar() != '\0')
603    {
604      String name = readName();
605      if (name.length() == 0)
606      {
607        throw new FHIRFormatError("Unable to read attribute on <"+node.getName()+">"+descLoc());
608      }
609      while (Character.isWhitespace(peekChar()))
610        readChar();
611
612      if (isNameChar(peekChar()) || peekChar() == '>' || peekChar() == '/')
613        node.getAttributes().put(name, null);
614      else if (peekChar() != '=')
615      {
616        throw new FHIRFormatError("Unable to read attribute '"+name+"' value on <"+node.getName()+">"+descLoc());
617      }
618      else
619      {
620        readChar();
621        while (Character.isWhitespace(peekChar()))
622          readChar();
623        if (peekChar() == '"' || peekChar() == '\'')
624          node.getAttributes().put(name, parseAttributeValue(readChar()));
625        else
626          node.getAttributes().put(name, parseAttributeValue('\0'));
627      }
628      while (Character.isWhitespace(peekChar()))
629        readChar();
630    }
631  }
632
633  private String parseAttributeValue(char term) throws IOException, FHIRFormatError 
634  {
635    StringBuilder b = new StringBuilder();
636    while (peekChar() != '\0' && peekChar() != '>' && (term != '\0' || peekChar() != '/') && peekChar() != term)
637    {
638      if (peekChar() == '&')
639      {
640        parseLiteral(b);
641      }
642      else
643        b.append(readChar());
644    }
645    if (peekChar() == term)
646      readChar();
647    return b.toString();
648  }
649
650  
651  private void skipWhiteSpaceAndComments(XhtmlNode focus) throws IOException, FHIRFormatError  {
652    while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff))
653      readChar();
654    if (peekChar() == '<')
655    {
656      char ch = readChar();
657      if (peekChar() == '!') {
658        readChar();
659        if (peekChar() == '-') {
660          readChar();
661          if (peekChar() == '-') {
662            readChar();
663            if (peekChar() == ' ')
664              readChar();
665            focus.addComment(readToCommentEnd());
666          } else 
667            throw new FHIRFormatError("unrecognised element type <!"+peekChar()+descLoc());
668        } else
669          focus.addDocType(readToDocTypeEnd());
670        skipWhiteSpaceAndComments(focus);
671      } else if (peekChar() == '?') {
672        String r = readToTagEnd();
673        focus.addInstruction(r.substring(1, r.length()-1));
674        skipWhiteSpaceAndComments(focus);
675      } else 
676        pushChar(ch);
677    }
678  }  
679  
680  private void skipWhiteSpace() throws IOException {
681    if (trimWhitespace)
682      while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff))
683        readChar();
684  }
685  
686  private void pushChar(char ch) {
687    cache = Character.toString(ch)+cache;    
688  }
689
690  private char peekChar() throws IOException
691  {
692    if (cache.length() > 0)
693      return cache.charAt(0);
694    else if (!rdr.ready())
695      return '\0';
696    else
697    {
698      char c = (char)rdr.read();
699      if (c == (char)-1)
700      {
701        cache = "";
702        return '\0';
703      }
704      cache =  Character.toString(c);
705      return c;
706    }
707  }
708
709  private char readChar() throws IOException
710  {
711    char c;
712    if (cache.length() > 0)
713    {
714      c = cache.charAt(0);
715      cache = cache.length() == 1 ? "" : cache.substring(1);
716    }
717    else if (!rdr.ready())
718      c = '\0';
719    else
720      c = (char)rdr.read();
721    if (c == '\r' || c == '\n') {
722      if (c == '\r' || lastChar != '\r') {
723        line++;
724        col = 0;
725      }  
726      lastChar = c;      
727    }      
728    col++;
729    return c;
730  }
731
732  private String readToTagEnd() throws IOException, FHIRFormatError 
733  {
734    StringBuilder s = new StringBuilder();
735    while (peekChar() != '>' && peekChar() != '\0')
736      s.append(readChar());
737    if (peekChar() != '\0')
738    {
739      readChar();
740      skipWhiteSpace();
741    } else if (mustBeWellFormed)
742      throw new FHIRFormatError("Unexpected termination of html source"+descLoc());
743    return s.toString();
744  }
745
746  private String readToDocTypeEnd() throws IOException, FHIRFormatError 
747  {
748    StringBuilder s = new StringBuilder();
749      
750    boolean done = false;
751    while (!done) {
752      char c = peekChar();
753      if (c == '>') {
754        done = true;
755        readChar();
756      } else if (c != '\0')
757        s.append(readChar());
758      else if (mustBeWellFormed)
759        throw new FHIRFormatError("Unexpected termination of html source"+descLoc());
760    }
761    return s.toString();
762  }
763
764  private String readToCommentEnd() throws IOException, FHIRFormatError 
765  {
766    if (peekChar() == '!')
767      readChar();
768    StringBuilder s = new StringBuilder();
769      
770    boolean simple = true;
771    if (peekChar() == '-') {
772      readChar();
773      simple = peekChar() != '-';
774      if (simple)
775        s.append('-');
776      else
777        readChar();
778    }
779    
780    boolean done = false;
781    while (!done) {
782      char c = peekChar();
783      if (c == '-') {
784        readChar();
785        if (peekChar() == '-') {
786          readChar();
787          if (peekChar() == '>') {
788            done = true;
789          } else
790            s.append("--");
791        } else 
792          s.append('-');
793      } else if (simple && peekChar() == '>') {
794        done = true;
795      } else if (c != '\0')
796        s.append(readChar());
797      else if (mustBeWellFormed)
798        throw new FHIRFormatError("Unexpected termination of html source"+descLoc());
799    }
800    if (peekChar() != '\0')
801    {
802      readChar();
803      skipWhiteSpace();
804    }
805    return s.toString();
806  }
807
808  private boolean isNameChar(char ch)
809  {
810    return Character.isLetterOrDigit(ch) || ch == '_' || ch == '-' || ch == ':';
811  }
812
813  private String readName() throws IOException
814  {
815    StringBuilder s = new StringBuilder();
816    while (isNameChar(peekChar()))
817      s.append(readChar());
818    return s.toString();
819  }
820
821  private String readUntil(char ch) throws IOException
822  {
823    StringBuilder s = new StringBuilder();
824    while (peekChar() != 0 && peekChar() != ch)
825      s.append(readChar());
826    readChar();
827    return s.toString();
828  }
829  
830  private void parseLiteral(StringBuilder s) throws IOException, FHIRFormatError {
831    // UInt16 w;
832    readChar();
833    String c = readUntil(';');
834    if (c.equals("apos"))
835      s.append('\'');
836    else if (c.equals("quot"))
837      s.append('"');
838    else if (c.equals("nbsp"))
839      s.append(XhtmlNode.NBSP);
840    else if (c.equals("amp"))
841      s.append('&');
842    else if (c.equals("lsquo"))
843      s.append((char) 8216); // right single quotation, U+2019 ISOnum 
844    else if (c.equals("rsquo"))
845      s.append((char) 8217); // right single quotation, U+2019 ISOnum 
846      //s.append((char)0x60); // right single quote
847          //s.append('’');
848    else if (c.equals("gt"))
849      s.append('>');
850    else if (c.equals("lt"))
851      s.append('<');
852    else if (c.equals("copy"))
853      s.append((char) 169);
854    else if (c.equals("reg"))
855      s.append((char) 174);
856    else if (c.equals("sect"))
857      s.append((char) 0xA7);
858    else if (c.charAt(0) == '#') {
859      if (isInteger(c.substring(1), 10))
860        s.append((char) Integer.parseInt(c.substring(1)));
861      else if (c.charAt(1) == 'x' && isInteger(c.substring(2), 16))
862        s.append((char) Integer.parseInt(c.substring(2), 16));
863    } else if (c.equals("fnof"))
864      s.append((char) 402); // latin small f with hook = function = florin, U+0192 ISOtech -->
865    else if (c.equals("Alpha"))
866      s.append((char) 913); // greek capital letter alpha, U+0391
867    else if (c.equals("Beta"))
868      s.append((char) 914); // greek capital letter beta, U+0392
869    else if (c.equals("Gamma"))
870      s.append((char) 915); // greek capital letter gamma, U+0393 ISOgrk3
871    else if (c.equals("Delta"))
872      s.append((char) 916); // greek capital letter delta, U+0394 ISOgrk3
873    else if (c.equals("Epsilon"))
874      s.append((char) 917); // greek capital letter epsilon, U+0395
875    else if (c.equals("Zeta"))
876      s.append((char) 918); // greek capital letter zeta, U+0396
877    else if (c.equals("Eta"))
878      s.append((char) 919); // greek capital letter eta, U+0397
879    else if (c.equals("Theta"))
880      s.append((char) 920); // greek capital letter theta, U+0398 ISOgrk3
881    else if (c.equals("Iota"))
882      s.append((char) 921); // greek capital letter iota, U+0399
883    else if (c.equals("Kappa"))
884      s.append((char) 922); // greek capital letter kappa, U+039A
885    else if (c.equals("Lambda"))
886      s.append((char) 923); // greek capital letter lambda, U+039B ISOgrk3
887    else if (c.equals("Mu"))
888      s.append((char) 924); // greek capital letter mu, U+039C
889    else if (c.equals("Nu"))
890      s.append((char) 925); // greek capital letter nu, U+039D
891    else if (c.equals("Xi"))
892      s.append((char) 926); // greek capital letter xi, U+039E ISOgrk3
893    else if (c.equals("Omicron"))
894      s.append((char) 927); // greek capital letter omicron, U+039F
895    else if (c.equals("Pi"))
896      s.append((char) 928); // greek capital letter pi, U+03A0 ISOgrk3
897    else if (c.equals("Rho"))
898      s.append((char) 929); // greek capital letter rho, U+03A1
899    else if (c.equals("Sigma"))
900      s.append((char) 931); // greek capital letter sigma, U+03A3 ISOgrk3
901    else if (c.equals("Tau"))
902      s.append((char) 932); // greek capital letter tau, U+03A4
903    else if (c.equals("Upsilon"))
904      s.append((char) 933); // greek capital letter upsilon, U+03A5 ISOgrk3
905    else if (c.equals("Phi"))
906      s.append((char) 934); // greek capital letter phi, U+03A6 ISOgrk3
907    else if (c.equals("Chi"))
908      s.append((char) 935); // greek capital letter chi, U+03A7
909    else if (c.equals("Psi"))
910      s.append((char) 936); // greek capital letter psi, U+03A8 ISOgrk3
911    else if (c.equals("Omega"))
912      s.append((char) 937); // greek capital letter omega, U+03A9 ISOgrk3
913    else if (c.equals("alpha"))
914      s.append((char) 945); // greek small letter alpha, U+03B1 ISOgrk3
915    else if (c.equals("beta"))
916      s.append((char) 946); // greek small letter beta, U+03B2 ISOgrk3
917    else if (c.equals("gamma"))
918      s.append((char) 947); // greek small letter gamma, U+03B3 ISOgrk3
919    else if (c.equals("delta"))
920      s.append((char) 948); // greek small letter delta, U+03B4 ISOgrk3
921    else if (c.equals("epsilon"))
922      s.append((char) 949); // greek small letter epsilon, U+03B5 ISOgrk3
923    else if (c.equals("zeta"))
924      s.append((char) 950); // greek small letter zeta, U+03B6 ISOgrk3
925    else if (c.equals("eta"))
926      s.append((char) 951); // greek small letter eta, U+03B7 ISOgrk3
927    else if (c.equals("theta"))
928      s.append((char) 952); // greek small letter theta, U+03B8 ISOgrk3
929    else if (c.equals("iota"))
930      s.append((char) 953); // greek small letter iota, U+03B9 ISOgrk3
931    else if (c.equals("kappa"))
932      s.append((char) 954); // greek small letter kappa, U+03BA ISOgrk3
933    else if (c.equals("lambda"))
934      s.append((char) 955); // greek small letter lambda, U+03BB ISOgrk3
935    else if (c.equals("mu"))
936      s.append((char) 956); // greek small letter mu, U+03BC ISOgrk3
937    else if (c.equals("nu"))
938      s.append((char) 957); // greek small letter nu, U+03BD ISOgrk3
939    else if (c.equals("xi"))
940      s.append((char) 958); // greek small letter xi, U+03BE ISOgrk3
941    else if (c.equals("omicron"))
942      s.append((char) 959); // greek small letter omicron, U+03BF NEW
943    else if (c.equals("pi"))
944      s.append((char) 960); // greek small letter pi, U+03C0 ISOgrk3
945    else if (c.equals("rho"))
946      s.append((char) 961); // greek small letter rho, U+03C1 ISOgrk3
947    else if (c.equals("sigmaf"))
948      s.append((char) 962); // greek small letter final sigma, U+03C2 ISOgrk3
949    else if (c.equals("sigma"))
950      s.append((char) 963); // greek small letter sigma, U+03C3 ISOgrk3
951    else if (c.equals("tau"))
952      s.append((char) 964); // greek small letter tau, U+03C4 ISOgrk3
953    else if (c.equals("upsilon"))
954      s.append((char) 965); // greek small letter upsilon, U+03C5 ISOgrk3
955    else if (c.equals("phi"))
956      s.append((char) 966); // greek small letter phi, U+03C6 ISOgrk3
957    else if (c.equals("chi"))
958      s.append((char) 967); // greek small letter chi, U+03C7 ISOgrk3
959    else if (c.equals("psi"))
960      s.append((char) 968); // greek small letter psi, U+03C8 ISOgrk3
961    else if (c.equals("omega"))
962      s.append((char) 969); // greek small letter omega, U+03C9 ISOgrk3
963    else if (c.equals("thetasym"))
964      s.append((char) 977); // greek small letter theta symbol, U+03D1 NEW
965    else if (c.equals("upsih"))
966      s.append((char) 978); // greek upsilon with hook symbol, U+03D2 NEW
967    else if (c.equals("piv"))
968      s.append((char) 982); // greek pi symbol, U+03D6 ISOgrk3
969    else if (c.equals("bull"))
970      s.append((char) 8226); // bullet = black small circle, U+2022 ISOpub
971    else if (c.equals("hellip"))
972      s.append((char) 8230); // horizontal ellipsis = three dot leader, U+2026 ISOpub
973    else if (c.equals("prime"))
974      s.append((char) 8242); // prime = minutes = feet, U+2032 ISOtech
975    else if (c.equals("Prime"))
976      s.append((char) 8243); // double prime = seconds = inches, U+2033 ISOtech
977    else if (c.equals("oline"))
978      s.append((char) 8254); // overline = spacing overscore, U+203E NEW
979    else if (c.equals("frasl"))
980      s.append((char) 8260); // fraction slash, U+2044 NEW
981    else if (c.equals("weierp"))
982      s.append((char) 8472); // script capital P = power set = Weierstrass p, U+2118 ISOamso
983    else if (c.equals("image"))
984      s.append((char) 8465); // blackletter capital I = imaginary part, U+2111 ISOamso
985    else if (c.equals("real"))
986      s.append((char) 8476); // blackletter capital R = real part symbol, U+211C ISOamso
987    else if (c.equals("trade"))
988      s.append((char) 8482); // trade mark sign, U+2122 ISOnum
989    else if (c.equals("alefsym"))
990      s.append((char) 8501); // alef symbol = first transfinite cardinal, U+2135 NEW
991    else if (c.equals("larr"))
992      s.append((char) 8592); // leftwards arrow, U+2190 ISOnum
993    else if (c.equals("uarr"))
994      s.append((char) 8593); // upwards arrow, U+2191 ISOnum
995    else if (c.equals("rarr"))
996      s.append((char) 8594); // rightwards arrow, U+2192 ISOnum
997    else if (c.equals("darr"))
998      s.append((char) 8595); // downwards arrow, U+2193 ISOnum
999    else if (c.equals("harr"))
1000      s.append((char) 8596); // left right arrow, U+2194 ISOamsa
1001    else if (c.equals("crarr"))
1002      s.append((char) 8629); // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
1003    else if (c.equals("lArr"))
1004      s.append((char) 8656); // leftwards double arrow, U+21D0 ISOtech
1005    else if (c.equals("uArr"))
1006      s.append((char) 8657); // upwards double arrow, U+21D1 ISOamsa
1007    else if (c.equals("rArr"))
1008      s.append((char) 8658); // rightwards double arrow, U+21D2 ISOtech
1009    else if (c.equals("dArr"))
1010      s.append((char) 8659); // downwards double arrow, U+21D3 ISOamsa
1011    else if (c.equals("hArr"))
1012      s.append((char) 8660); // left right double arrow, U+21D4 ISOamsa
1013    else if (c.equals("forall"))
1014      s.append((char) 8704); // for all, U+2200 ISOtech
1015    else if (c.equals("part"))
1016      s.append((char) 8706); // partial differential, U+2202 ISOtech
1017    else if (c.equals("exist"))
1018      s.append((char) 8707); // there exists, U+2203 ISOtech
1019    else if (c.equals("empty"))
1020      s.append((char) 8709); // empty set = null set = diameter, U+2205 ISOamso
1021    else if (c.equals("nabla"))
1022      s.append((char) 8711); // nabla = backward difference, U+2207 ISOtech
1023    else if (c.equals("isin"))
1024      s.append((char) 8712); // element of, U+2208 ISOtech
1025    else if (c.equals("notin"))
1026      s.append((char) 8713); // not an element of, U+2209 ISOtech
1027    else if (c.equals("ni"))
1028      s.append((char) 8715); // contains as member, U+220B ISOtech
1029    else if (c.equals("prod"))
1030      s.append((char) 8719); // n-ary product = product sign, U+220F ISOamsb
1031    else if (c.equals("sum"))
1032      s.append((char) 8721); // n-ary sumation, U+2211 ISOamsb
1033    else if (c.equals("minus"))
1034      s.append((char) 8722); // minus sign, U+2212 ISOtech
1035    else if (c.equals("lowast"))
1036      s.append((char) 8727); // asterisk operator, U+2217 ISOtech
1037    else if (c.equals("radic"))
1038      s.append((char) 8730); // square root = radical sign, U+221A ISOtech
1039    else if (c.equals("prop"))
1040      s.append((char) 8733); // proportional to, U+221D ISOtech
1041    else if (c.equals("infin"))
1042      s.append((char) 8734); // infinity, U+221E ISOtech -->
1043    else if (c.equals("ang"))
1044      s.append((char) 8736); // angle, U+2220 ISOamso
1045    else if (c.equals("and"))
1046      s.append((char) 8743); // logical and = wedge, U+2227 ISOtech
1047    else if (c.equals("or"))
1048      s.append((char) 8744); // logical or = vee, U+2228 ISOtech
1049    else if (c.equals("cap"))
1050      s.append((char) 8745); // intersection = cap, U+2229 ISOtech
1051    else if (c.equals("cup"))
1052      s.append((char) 8746); // union = cup, U+222A ISOtech
1053    else if (c.equals("int"))
1054      s.append((char) 8747); // integral, U+222B ISOtech
1055    else if (c.equals("there4"))
1056      s.append((char) 8756); // therefore, U+2234 ISOtech
1057    else if (c.equals("sim"))
1058      s.append((char) 8764); // tilde operator = varies with = similar t U+223C ISOtech
1059    else if (c.equals("cong"))
1060      s.append((char) 8773); // approximately equal to, U+2245 ISOtec
1061    else if (c.equals("asymp"))
1062      s.append((char) 8776); // almost equal to = asymptotic to, U+2248 ISOamsr
1063    else if (c.equals("ne"))
1064      s.append((char) 8800); // not equal to, U+2260 ISOtech
1065    else if (c.equals("equiv"))
1066      s.append((char) 8801); // identical to, U+2261 ISOtech
1067    else if (c.equals("le"))
1068      s.append((char) 8804); // less-than or equal to, U+2264 ISOtech
1069    else if (c.equals("ge"))
1070      s.append((char) 8805); // greater-than or equal to, U+2265 ISOtech
1071    else if (c.equals("sub"))
1072      s.append((char) 8834); // subset of, U+2282 ISOtech
1073    else if (c.equals("sup"))
1074      s.append((char) 8835); // superset of, U+2283 ISOtech
1075    else if (c.equals("nsub"))
1076      s.append((char) 8836); // not a subset of, U+2284 ISOamsn
1077    else if (c.equals("sube"))
1078      s.append((char) 8838); // subset of or equal to, U+2286 ISOtech
1079    else if (c.equals("supe"))
1080      s.append((char) 8839); // superset of or equal to, U+2287 ISOtech
1081    else if (c.equals("oplus"))
1082      s.append((char) 8853); // circled plus = direct sum, U+2295 ISOamsb
1083    else if (c.equals("otimes"))
1084      s.append((char) 8855); // circled times = vector product, U+2297 ISOamsb -->
1085    else if (c.equals("perp"))
1086      s.append((char) 8869); // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
1087    else if (c.equals("sdot"))
1088      s.append((char) 8901); // dot operator, U+22C5 ISOamsb
1089    else if (c.equals("lceil"))
1090      s.append((char) 8968); // left ceiling = apl upstile, U+2308 ISOamsc
1091    else if (c.equals("rceil"))
1092      s.append((char) 8969); // right ceiling, U+2309 ISOamsc
1093    else if (c.equals("lfloor"))
1094      s.append((char) 8970); // left floor = apl downstile, U+230A ISOamsc
1095    else if (c.equals("rfloor"))
1096      s.append((char) 8971); // right floor, U+230B ISOamsc
1097    else if (c.equals("lang"))
1098      s.append((char) 9001); // left-pointing angle bracket = bra, U+2329 ISOtech
1099    else if (c.equals("rang"))
1100      s.append((char) 9002); // right-pointing angle bracket = ket, U+232A ISOtech
1101    else if (c.equals("loz"))
1102      s.append((char) 9674); // lozenge, U+25CA ISOpub
1103    else if (c.equals("spades"))
1104      s.append((char) 9824); // black spade suit, U+2660 ISOpub
1105    else if (c.equals("clubs"))
1106      s.append((char) 9827); // black club suit = shamrock, U+2663 ISOpub
1107    else if (c.equals("hearts"))
1108      s.append((char) 9829); // black heart suit = valentine, U+2665 ISOpub
1109    else if (c.equals("diams"))
1110      s.append((char) 9830); // black diamond suit, U+2666 ISOpub --
1111    else if (c.equals("ndash"))
1112      s.append((char) 8211); 
1113    else if (c.equals("mdash"))
1114      s.append((char) 8212); 
1115    else if (c.equals("ldquo"))
1116      s.append((char) 8221); 
1117    else if (c.equals("rdquo"))
1118      s.append((char) 201D); 
1119    else
1120      throw new FHIRFormatError("unable to parse character reference '" + c + "'' (last text = '" + lastText + "'" + descLoc());
1121  }
1122  
1123  private boolean isInteger(String s, int base) {
1124    try {
1125      Integer.parseInt(s, base);
1126      return true;
1127    } catch (Exception e) {
1128      return false;
1129    }
1130  }
1131
1132  public XhtmlNode parseFragment(String source) throws IOException, FHIRException  {
1133    rdr = new StringReader(source);
1134    return parseFragment();
1135  }
1136  
1137  public XhtmlNode parseFragment(InputStream input) throws IOException, FHIRException  {
1138    rdr = new InputStreamReader(input);
1139    return parseFragment();
1140  }
1141  
1142  private XhtmlNode parseFragment() throws IOException, FHIRException 
1143  {
1144    skipWhiteSpace();
1145    if (peekChar() != '<')
1146      throw new FHIRException("Unable to Parse HTML - does not start with tag. Found "+peekChar()+descLoc());
1147    readChar();
1148    if (peekChar() == '?') {
1149      readToTagEnd();
1150      skipWhiteSpace();
1151      if (peekChar() != '<')
1152        throw new FHIRException("Unable to Parse HTML - does not start with tag after processing instruction. Found "+peekChar()+descLoc());
1153      readChar();
1154    }
1155    String n = readName().toLowerCase();
1156    readToTagEnd();
1157    XhtmlNode result = new XhtmlNode(NodeType.Element);
1158    
1159    int colonIndex = n.indexOf(':');
1160    if (colonIndex != -1) {
1161      n = n.substring(colonIndex + 1);
1162    }
1163    
1164    result.setName(n);
1165    unwindPoint = null;
1166    List<XhtmlNode> p = new ArrayList<XhtmlNode>();
1167    parseElementInner(result, p, null, true);
1168
1169    return result;
1170  }
1171
1172  
1173}