001/* 002Copyright (c) 2011+, HL7, Inc 003All rights reserved. 004 005Redistribution and use in source and binary forms, with or without modification, 006are permitted provided that the following conditions are met: 007 008 * Redistributions of source code must retain the above copyright notice, this 009 list of conditions and the following disclaimer. 010 * Redistributions in binary form must reproduce the above copyright notice, 011 this list of conditions and the following disclaimer in the documentation 012 and/or other materials provided with the distribution. 013 * Neither the name of HL7 nor the names of its contributors may be used to 014 endorse or promote products derived from this software without specific 015 prior written permission. 016 017THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 018ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 019WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 020IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 021INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 022NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 023PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 024WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 025ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 026POSSIBILITY OF SUCH DAMAGE. 027 028*/ 029package org.hl7.fhir.utilities.xhtml; 030 031import java.io.IOException; 032import java.io.InputStream; 033import java.io.InputStreamReader; 034import java.io.Reader; 035import java.io.StringReader; 036import java.util.ArrayList; 037import java.util.HashMap; 038import java.util.HashSet; 039import java.util.List; 040import java.util.Map; 041import java.util.Set; 042 043import org.hl7.fhir.exceptions.FHIRException; 044import org.hl7.fhir.exceptions.FHIRFormatError; 045import org.hl7.fhir.utilities.xhtml.XhtmlNode.Location; 046import org.w3c.dom.Attr; 047import org.w3c.dom.Element; 048import org.w3c.dom.Node; 049import org.xmlpull.v1.XmlPullParser; 050import org.xmlpull.v1.XmlPullParserException; 051 052public class XhtmlParser { 053 public static final String XHTML_NS = "http://www.w3.org/1999/xhtml"; 054 055 public class NSMap { 056 private Map<String, String> nslist = new HashMap<String, String>(); 057 058 public NSMap(NSMap nsm) { 059 if (nsm != null) 060 nslist.putAll(nsm.nslist); 061 } 062 063 public void def(String ns) { 064 nslist.put("", ns); 065 } 066 067 public void ns(String abbrev, String ns) { 068 nslist.put(abbrev, ns); 069 } 070 071 public String def() { 072 return nslist.get(""); 073 } 074 075 public boolean hasDef() { 076 return nslist.containsKey(""); 077 } 078 079 public String get(String abbrev) { 080 return nslist.containsKey(abbrev) ? nslist.get(abbrev) : "http://error/undefined-namespace"; 081 } 082 } 083 084 public class QName { 085 private String ns; 086 private String name; 087 088 public QName(String src) { 089 if (src.contains(":")) { 090 ns = src.substring(0, src.indexOf(":")); 091 name = src.substring(src.indexOf(":")+1); 092 } else { 093 ns = null; 094 name = src; 095 } 096 } 097 098 public String getName() { 099 return name; 100 } 101 102 public boolean hasNs() { 103 return ns != null; 104 } 105 106 public String getNs() { 107 return ns; 108 } 109 110 @Override 111 public String toString() { 112 return ns+"::"+name; 113 } 114 115 } 116 117 private Set<String> elements = new HashSet<String>(); 118 private Set<String> attributes = new HashSet<String>(); 119 120 121 public XhtmlParser() { 122 super(); 123 policy = ParserSecurityPolicy.Accept; // for general parsing 124 125 // set up sets 126 elements.add("p"); 127 elements.add("br"); 128 elements.add("div"); 129 elements.add("h1"); 130 elements.add("h2"); 131 elements.add("h3"); 132 elements.add("h4"); 133 elements.add("h5"); 134 elements.add("h6"); 135 elements.add("a"); 136 elements.add("span"); 137 elements.add("b"); 138 elements.add("em"); 139 elements.add("i"); 140 elements.add("strong"); 141 elements.add("small"); 142 elements.add("big"); 143 elements.add("tt"); 144 elements.add("small"); 145 elements.add("dfn"); 146 elements.add("q"); 147 elements.add("var"); 148 elements.add("abbr"); 149 elements.add("acronym"); 150 elements.add("cite"); 151 elements.add("blockquote"); 152 elements.add("hr"); 153 elements.add("address"); 154 elements.add("bdo"); 155 elements.add("kbd"); 156 elements.add("q"); 157 elements.add("sub"); 158 elements.add("sup"); 159 elements.add("ul"); 160 elements.add("ol"); 161 elements.add("li"); 162 elements.add("dl"); 163 elements.add("dt"); 164 elements.add("dd"); 165 elements.add("pre"); 166 elements.add("table"); 167 elements.add("caption"); 168 elements.add("colgroup"); 169 elements.add("col"); 170 elements.add("thead"); 171 elements.add("tr"); 172 elements.add("tfoot"); 173 elements.add("tbody"); 174 elements.add("th"); 175 elements.add("td"); 176 elements.add("code"); 177 elements.add("samp"); 178 elements.add("img"); 179 elements.add("map"); 180 elements.add("area"); 181 182 attributes.add("title"); 183 attributes.add("style"); 184 attributes.add("class"); 185 attributes.add("id"); 186 attributes.add("lang"); 187 attributes.add("xml:lang"); 188 attributes.add("dir"); 189 attributes.add("accesskey"); 190 attributes.add("tabindex"); 191 // tables: 192 attributes.add("span"); 193 attributes.add("width"); 194 attributes.add("align"); 195 attributes.add("valign"); 196 attributes.add("char"); 197 attributes.add("charoff"); 198 attributes.add("abbr"); 199 attributes.add("axis"); 200 attributes.add("headers"); 201 attributes.add("scope"); 202 attributes.add("rowspan"); 203 attributes.add("colspan"); 204 205 attributes.add("a.href"); 206 attributes.add("a.name"); 207 attributes.add("img.src"); 208 attributes.add("img.border"); 209 attributes.add("div.xmlns"); 210 attributes.add("blockquote.cite"); 211 attributes.add("q.cite"); 212 attributes.add("a.charset"); 213 attributes.add("a.type"); 214 attributes.add("a.name"); 215 attributes.add("a.href"); 216 attributes.add("a.hreflang"); 217 attributes.add("a.rel"); 218 attributes.add("a.rev"); 219 attributes.add("a.shape"); 220 attributes.add("a.coords"); 221 attributes.add("img.src"); 222 attributes.add("img.alt"); 223 attributes.add("img.longdesc"); 224 attributes.add("img.height"); 225 attributes.add("img.width"); 226 attributes.add("img.usemap"); 227 attributes.add("img.ismap"); 228 attributes.add("map.name"); 229 attributes.add("area.shape"); 230 attributes.add("area.coords"); 231 attributes.add("area.href"); 232 attributes.add("area.nohref"); 233 attributes.add("area.alt"); 234 attributes.add("table.summary"); 235 attributes.add("table.width"); 236 attributes.add("table.border"); 237 attributes.add("table.frame"); 238 attributes.add("table.rules"); 239 attributes.add("table.cellspacing"); 240 attributes.add("table.cellpadding"); 241} 242 243public enum ParserSecurityPolicy { 244 Accept, 245 Drop, 246 Reject 247 } 248 249 private ParserSecurityPolicy policy; 250 251 private boolean trimWhitespace; 252 private boolean mustBeWellFormed = true; 253 private boolean validatorMode; 254 255 public boolean isTrimWhitespace() { 256 return trimWhitespace; 257 } 258 259 public void setTrimWhitespace(boolean trimWhitespace) { 260 this.trimWhitespace = trimWhitespace; 261 } 262 263 public boolean isMustBeWellFormed() { 264 return mustBeWellFormed; 265 } 266 267 public XhtmlParser setMustBeWellFormed(boolean mustBeWellFormed) { 268 this.mustBeWellFormed = mustBeWellFormed; 269 return this; 270 } 271 272 273 public boolean isValidatorMode() { 274 return validatorMode; 275 } 276 277 public XhtmlParser setValidatorMode(boolean validatorMode) { 278 this.validatorMode = validatorMode; 279 return this; 280 } 281 282 public ParserSecurityPolicy getPolicy() { 283 return policy; 284 } 285 286 public void setPolicy(ParserSecurityPolicy policy) { 287 this.policy = policy; 288 } 289 290 public XhtmlNode parseHtmlNode(Element node) throws FHIRFormatError { 291 return parseHtmlNode(node, null); 292 } 293 294 public XhtmlNode parseHtmlNode(Element node, String defaultNS) throws FHIRFormatError { 295 XhtmlNode res = parseNode(node, defaultNS); 296 if (res.getNsDecl() == null) 297 res.getAttributes().put("xmlns", XHTML_NS); 298 return res; 299 } 300 301 private XhtmlNode parseNode(Element node, String defaultNS) throws FHIRFormatError { 302 XhtmlNode res = new XhtmlNode(NodeType.Element); 303 res.setName(node.getLocalName()); 304 defaultNS = checkNS(res, node, defaultNS); 305 for (int i = 0; i < node.getAttributes().getLength(); i++) { 306 Attr attr = (Attr) node.getAttributes().item(i); 307 if (attributeIsOk(res.getName(), attr.getName(), attr.getValue()) && !attr.getLocalName().startsWith("xmlns")) 308 res.getAttributes().put(attr.getName(), attr.getValue()); 309 } 310 Node child = node.getFirstChild(); 311 while (child != null) { 312 if (child.getNodeType() == Node.TEXT_NODE) { 313 res.addText(child.getTextContent()); 314 } else if (child.getNodeType() == Node.COMMENT_NODE) { 315 res.addComment(child.getTextContent()); 316 } else if (child.getNodeType() == Node.ELEMENT_NODE) { 317 if (elementIsOk(child.getLocalName())) 318 res.getChildNodes().add(parseNode((Element) child, defaultNS)); 319 } else 320 throw new FHIRFormatError("Unhandled XHTML feature: "+Integer.toString(child.getNodeType())+descLoc()); 321 child = child.getNextSibling(); 322 } 323 return res; 324 } 325 326 private String checkNS(XhtmlNode res, Element node, String defaultNS) { 327 if (!validatorMode) 328 return null; 329 String ns = node.getNamespaceURI(); 330 if (ns == null) 331 return null; 332 if (!ns.equals(defaultNS)) { 333 res.getAttributes().put("xmlns", ns); 334 return ns; 335 } 336 return defaultNS; 337 } 338 339 public XhtmlNode parseHtmlNode(XmlPullParser xpp) throws XmlPullParserException, IOException, FHIRFormatError { 340 XhtmlNode res = parseNode(xpp); 341 if (res.getNsDecl() == null) 342 res.getAttributes().put("xmlns", XHTML_NS); 343 return res; 344 345 } 346 private XhtmlNode parseNode(XmlPullParser xpp) throws XmlPullParserException, IOException, FHIRFormatError { 347 XhtmlNode res = new XhtmlNode(NodeType.Element); 348 res.setName(xpp.getName()); 349 350 for (int i = 0; i < xpp.getAttributeCount(); i++) { 351 if (attributeIsOk(xpp.getName(), xpp.getAttributeName(i), xpp.getAttributeValue(i))) 352 res.getAttributes().put(xpp.getAttributeName(i), xpp.getAttributeValue(i)); 353 } 354 int eventType = xpp.next(); 355 while (eventType != XmlPullParser.END_TAG) { 356 if (eventType == XmlPullParser.TEXT) { 357 res.addText(xpp.getText()); 358 xpp.next(); 359 } else if (eventType == XmlPullParser.COMMENT) { 360 res.addComment(xpp.getText()); 361 xpp.next(); 362 } else if (eventType == XmlPullParser.START_TAG) { 363 if (elementIsOk(xpp.getName())) 364 res.getChildNodes().add(parseNode(xpp)); 365 } else 366 throw new FHIRFormatError("Unhandled XHTML feature: "+Integer.toString(eventType)+descLoc()); 367 eventType = xpp.getEventType(); 368 } 369 xpp.next(); 370 return res; 371 } 372 373 private boolean attributeIsOk(String elem, String attr, String value) throws FHIRFormatError { 374 if (validatorMode) 375 return true; 376 boolean ok = attributes.contains(attr) || attributes.contains(elem+"."+attr); 377 if (ok) 378 return true; 379 else switch (policy) { 380 case Accept: return true; 381 case Drop: return false; 382 case Reject: throw new FHIRFormatError("Illegal HTML attribute "+elem+"."+attr); 383 } 384 385 if ((elem+"."+attr).equals("img.src") && !(value.startsWith("#") || value.startsWith("http:") || value.startsWith("https:"))) { 386 switch (policy) { 387 case Accept: return true; 388 case Drop: return false; 389 case Reject: throw new FHIRFormatError("Illegal Image Reference "+value); 390 } 391 } 392 return false; 393 } 394 395private boolean elementIsOk(String name) throws FHIRFormatError { 396 if (validatorMode) 397 return true; 398 boolean ok = elements.contains(name); 399 if (ok) 400 return true; 401 else switch (policy) { 402 case Accept: return true; 403 case Drop: return false; 404 case Reject: throw new FHIRFormatError("Illegal HTML element "+name); 405 } 406 return false; 407} 408 409 private String descLoc() { 410 return " at line "+Integer.toString(line)+" column "+Integer.toString(col); 411 } 412 413 private Reader rdr; 414 private String cache = ""; 415 private XhtmlNode unwindPoint; 416 private String lastText = ""; 417 private int line = 1; 418 private int col = 0; 419 private char lastChar; 420 private Location lastLoc; 421 422 public XhtmlDocument parse(String source, String entryName) throws FHIRFormatError, IOException { 423 rdr = new StringReader(source); 424 return parse(entryName); 425 } 426 427 public XhtmlDocument parse(InputStream input, String entryName) throws FHIRFormatError, IOException { 428 rdr = new InputStreamReader(input, "UTF-8"); 429 return parse(entryName); 430 } 431 432 private XhtmlDocument parse(String entryName) throws FHIRFormatError, IOException 433 { 434 XhtmlDocument result = new XhtmlDocument(); 435 skipWhiteSpaceAndComments(result); 436 if (peekChar() != '<') 437 throw new FHIRFormatError("Unable to Parse HTML - does not start with tag. Found "+peekChar()+descLoc()); 438 readChar(); 439 markLocation(); 440 QName n = new QName(readName().toLowerCase()); 441 if ((entryName != null) && !n.getName().equals(entryName)) 442 throw new FHIRFormatError("Unable to Parse HTML - starts with '"+n+"' not '"+entryName+"'"+descLoc()); 443 XhtmlNode root = result.addTag(n.getName()); 444 root.setLocation(markLocation()); 445 parseAttributes(root); 446 markLocation(); 447 NSMap nsm = checkNamespaces(n, root, null, true); 448 if (readChar() == '/') { 449 if (peekChar() != '>') 450 throw new FHIRFormatError("unexpected non-end of element "+n+" "+descLoc()); 451 readChar(); 452 } else { 453 unwindPoint = null; 454 List<XhtmlNode> p = new ArrayList<XhtmlNode>(); 455 parseElementInner(root, p, nsm, true); 456 } 457 return result; 458 } 459 460 private Location markLocation() { 461 Location res = lastLoc; 462 lastLoc = new Location(line, col); 463 return res; 464 } 465 466 private NSMap checkNamespaces(QName n, XhtmlNode node, NSMap nsm, boolean root) { 467 // what we do here is strip out any stated namespace attributes, putting them in the namesapce map 468 // then we figure out what the namespace of this element is, and state it explicitly if it's not the default 469 470 // but we don't bother with any of this if we're not validating 471 if (!validatorMode) 472 return null; 473 NSMap result = new NSMap(nsm); 474 List<String> nsattrs = new ArrayList<String>(); 475 for (String an : node.getAttributes().keySet()) { 476 if (an.equals("xmlns")) { 477 result.def(node.getAttribute(an)); 478 nsattrs.add(an); 479 } 480 if (an.startsWith("xmlns:")) { 481 result.ns(an.substring(6), node.getAttribute(an)); 482 nsattrs.add(an); 483 } 484 } 485 for (String s : nsattrs) 486 node.getAttributes().remove(s); 487 if (n.hasNs()) { 488 String nns = result.get(n.getNs()); 489 if (!nns.equals(result.def())) { 490 node.getAttributes().put("xmlns", nns); 491 result.def(nns); 492 } 493 } else if (root && result.hasDef()) { 494 node.getAttributes().put("xmlns", result.def()); 495 } 496 return result; 497 } 498 499 private void addTextNode(XhtmlNode node, StringBuilder s) 500 { 501 String t = isTrimWhitespace() ? s.toString().trim() : s.toString(); 502 if (t.length() > 0) 503 { 504 lastText = t; 505 // System.out.println(t); 506 node.addText(t).setLocation(markLocation()); 507 s.setLength(0); 508 } 509 } 510 private void parseElementInner(XhtmlNode node, List<XhtmlNode> parents, NSMap nsm, boolean escaping) throws FHIRFormatError, IOException 511 { 512 StringBuilder s = new StringBuilder(); 513 while (peekChar() != '\0' && !parents.contains(unwindPoint) && !(node == unwindPoint)) 514 { 515 if (peekChar() == '<') 516 { 517 addTextNode(node, s); 518 readChar(); 519 if (peekChar() == '!') { 520 String sc = readToCommentEnd(); 521 if (sc.startsWith("DOCTYPE")) 522 throw new FHIRFormatError("Malformed XHTML: Found a DocType declaration, and these are not allowed (XXE security vulnerability protection)"); 523 node.addComment(sc).setLocation(markLocation()); 524 } else if (peekChar() == '?') 525 node.addComment(readToTagEnd()).setLocation(markLocation()); 526 else if (peekChar() == '/') { 527 readChar(); 528 QName n = new QName(readToTagEnd()); 529 if (node.getName().equals(n.getName())) 530 return; 531 else 532 { 533 if (mustBeWellFormed) 534 throw new FHIRFormatError("Malformed XHTML: Found \"</"+n.getName()+">\" expecting \"</"+node.getName()+">\""+descLoc()); 535 for (int i = parents.size() - 1; i >= 0; i--) 536 { 537 if (parents.get(i).getName().equals(n)) 538 unwindPoint = parents.get(i); 539 } 540 if (unwindPoint != null) 541 { 542 for (int i = parents.size(); i > 0; i--) 543 { 544 if (i < parents.size() && parents.get(i) == unwindPoint) 545 return; 546 if (i == parents.size()) 547 { 548 parents.get(i - 1).getChildNodes().addAll(node.getChildNodes()); 549 node.getChildNodes().clear(); 550 } 551 else 552 { 553 parents.get(i - 1).getChildNodes().addAll(parents.get(i).getChildNodes()); 554 parents.get(i).getChildNodes().clear(); 555 } 556 } 557 } 558 } 559 } 560 else if (Character.isLetterOrDigit(peekChar())) 561 { 562 parseElement(node, parents, nsm); 563 } 564 else 565 throw new FHIRFormatError("Unable to Parse HTML - node '" + node.getName() + "' has unexpected content '"+peekChar()+"' (last text = '"+lastText+"'"+descLoc()); 566 } 567 else if (peekChar() == '&') // escaping && 568 { 569 parseLiteral(s); 570 } 571 else 572 s.append(readChar()); 573 } 574 addTextNode(node, s); 575 } 576 577 private void parseElement(XhtmlNode parent, List<XhtmlNode> parents, NSMap nsm) throws IOException, FHIRFormatError 578 { 579 markLocation(); 580 QName name = new QName(readName()); 581 XhtmlNode node = parent.addTag(name.getName()); 582 node.setLocation(markLocation()); 583 List<XhtmlNode> newParents = new ArrayList<XhtmlNode>(); 584 newParents.addAll(parents); 585 newParents.add(parent); 586 parseAttributes(node); 587 markLocation(); 588 nsm = checkNamespaces(name, node, nsm, false); 589 if (readChar() == '/') { 590 if (peekChar() != '>') 591 throw new FHIRFormatError("unexpected non-end of element "+name+" "+descLoc()); 592 readChar(); 593 } else { 594 parseElementInner(node, newParents, nsm, "script".equals(name.getName())); 595 } 596 } 597 598 private void parseAttributes(XhtmlNode node) throws FHIRFormatError, IOException 599 { 600 while (Character.isWhitespace(peekChar())) 601 readChar(); 602 while (peekChar() != '>' && peekChar() != '/' && peekChar() != '\0') 603 { 604 String name = readName(); 605 if (name.length() == 0) 606 { 607 throw new FHIRFormatError("Unable to read attribute on <"+node.getName()+">"+descLoc()); 608 } 609 while (Character.isWhitespace(peekChar())) 610 readChar(); 611 612 if (isNameChar(peekChar()) || peekChar() == '>' || peekChar() == '/') 613 node.getAttributes().put(name, null); 614 else if (peekChar() != '=') 615 { 616 throw new FHIRFormatError("Unable to read attribute '"+name+"' value on <"+node.getName()+">"+descLoc()); 617 } 618 else 619 { 620 readChar(); 621 while (Character.isWhitespace(peekChar())) 622 readChar(); 623 if (peekChar() == '"' || peekChar() == '\'') 624 node.getAttributes().put(name, parseAttributeValue(readChar())); 625 else 626 node.getAttributes().put(name, parseAttributeValue('\0')); 627 } 628 while (Character.isWhitespace(peekChar())) 629 readChar(); 630 } 631 } 632 633 private String parseAttributeValue(char term) throws IOException, FHIRFormatError 634 { 635 StringBuilder b = new StringBuilder(); 636 while (peekChar() != '\0' && peekChar() != '>' && (term != '\0' || peekChar() != '/') && peekChar() != term) 637 { 638 if (peekChar() == '&') 639 { 640 parseLiteral(b); 641 } 642 else 643 b.append(readChar()); 644 } 645 if (peekChar() == term) 646 readChar(); 647 return b.toString(); 648 } 649 650 651 private void skipWhiteSpaceAndComments(XhtmlNode focus) throws IOException, FHIRFormatError { 652 while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff)) 653 readChar(); 654 if (peekChar() == '<') 655 { 656 char ch = readChar(); 657 if (peekChar() == '!') { 658 readChar(); 659 if (peekChar() == '-') { 660 readChar(); 661 if (peekChar() == '-') { 662 readChar(); 663 if (peekChar() == ' ') 664 readChar(); 665 focus.addComment(readToCommentEnd()); 666 } else 667 throw new FHIRFormatError("unrecognised element type <!"+peekChar()+descLoc()); 668 } else 669 focus.addDocType(readToDocTypeEnd()); 670 skipWhiteSpaceAndComments(focus); 671 } else if (peekChar() == '?') { 672 String r = readToTagEnd(); 673 focus.addInstruction(r.substring(1, r.length()-1)); 674 skipWhiteSpaceAndComments(focus); 675 } else 676 pushChar(ch); 677 } 678 } 679 680 private void skipWhiteSpace() throws IOException { 681 if (trimWhitespace) 682 while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff)) 683 readChar(); 684 } 685 686 private void pushChar(char ch) { 687 cache = Character.toString(ch)+cache; 688 } 689 690 private char peekChar() throws IOException 691 { 692 if (cache.length() > 0) 693 return cache.charAt(0); 694 else if (!rdr.ready()) 695 return '\0'; 696 else 697 { 698 char c = (char)rdr.read(); 699 if (c == (char)-1) 700 { 701 cache = ""; 702 return '\0'; 703 } 704 cache = Character.toString(c); 705 return c; 706 } 707 } 708 709 private char readChar() throws IOException 710 { 711 char c; 712 if (cache.length() > 0) 713 { 714 c = cache.charAt(0); 715 cache = cache.length() == 1 ? "" : cache.substring(1); 716 } 717 else if (!rdr.ready()) 718 c = '\0'; 719 else 720 c = (char)rdr.read(); 721 if (c == '\r' || c == '\n') { 722 if (c == '\r' || lastChar != '\r') { 723 line++; 724 col = 0; 725 } 726 lastChar = c; 727 } 728 col++; 729 return c; 730 } 731 732 private String readToTagEnd() throws IOException, FHIRFormatError 733 { 734 StringBuilder s = new StringBuilder(); 735 while (peekChar() != '>' && peekChar() != '\0') 736 s.append(readChar()); 737 if (peekChar() != '\0') 738 { 739 readChar(); 740 skipWhiteSpace(); 741 } else if (mustBeWellFormed) 742 throw new FHIRFormatError("Unexpected termination of html source"+descLoc()); 743 return s.toString(); 744 } 745 746 private String readToDocTypeEnd() throws IOException, FHIRFormatError 747 { 748 StringBuilder s = new StringBuilder(); 749 750 boolean done = false; 751 while (!done) { 752 char c = peekChar(); 753 if (c == '>') { 754 done = true; 755 readChar(); 756 } else if (c != '\0') 757 s.append(readChar()); 758 else if (mustBeWellFormed) 759 throw new FHIRFormatError("Unexpected termination of html source"+descLoc()); 760 } 761 return s.toString(); 762 } 763 764 private String readToCommentEnd() throws IOException, FHIRFormatError 765 { 766 if (peekChar() == '!') 767 readChar(); 768 StringBuilder s = new StringBuilder(); 769 770 boolean simple = true; 771 if (peekChar() == '-') { 772 readChar(); 773 simple = peekChar() != '-'; 774 if (simple) 775 s.append('-'); 776 else 777 readChar(); 778 } 779 780 boolean done = false; 781 while (!done) { 782 char c = peekChar(); 783 if (c == '-') { 784 readChar(); 785 if (peekChar() == '-') { 786 readChar(); 787 if (peekChar() == '>') { 788 done = true; 789 } else 790 s.append("--"); 791 } else 792 s.append('-'); 793 } else if (simple && peekChar() == '>') { 794 done = true; 795 } else if (c != '\0') 796 s.append(readChar()); 797 else if (mustBeWellFormed) 798 throw new FHIRFormatError("Unexpected termination of html source"+descLoc()); 799 } 800 if (peekChar() != '\0') 801 { 802 readChar(); 803 skipWhiteSpace(); 804 } 805 return s.toString(); 806 } 807 808 private boolean isNameChar(char ch) 809 { 810 return Character.isLetterOrDigit(ch) || ch == '_' || ch == '-' || ch == ':'; 811 } 812 813 private String readName() throws IOException 814 { 815 StringBuilder s = new StringBuilder(); 816 while (isNameChar(peekChar())) 817 s.append(readChar()); 818 return s.toString(); 819 } 820 821 private String readUntil(char ch) throws IOException 822 { 823 StringBuilder s = new StringBuilder(); 824 while (peekChar() != 0 && peekChar() != ch) 825 s.append(readChar()); 826 readChar(); 827 return s.toString(); 828 } 829 830 private void parseLiteral(StringBuilder s) throws IOException, FHIRFormatError { 831 // UInt16 w; 832 readChar(); 833 String c = readUntil(';'); 834 if (c.equals("apos")) 835 s.append('\''); 836 else if (c.equals("quot")) 837 s.append('"'); 838 else if (c.equals("nbsp")) 839 s.append(XhtmlNode.NBSP); 840 else if (c.equals("amp")) 841 s.append('&'); 842 else if (c.equals("lsquo")) 843 s.append((char) 8216); // right single quotation, U+2019 ISOnum 844 else if (c.equals("rsquo")) 845 s.append((char) 8217); // right single quotation, U+2019 ISOnum 846 //s.append((char)0x60); // right single quote 847 //s.append('’'); 848 else if (c.equals("gt")) 849 s.append('>'); 850 else if (c.equals("lt")) 851 s.append('<'); 852 else if (c.equals("copy")) 853 s.append((char) 169); 854 else if (c.equals("reg")) 855 s.append((char) 174); 856 else if (c.equals("sect")) 857 s.append((char) 0xA7); 858 else if (c.charAt(0) == '#') { 859 if (isInteger(c.substring(1), 10)) 860 s.append((char) Integer.parseInt(c.substring(1))); 861 else if (c.charAt(1) == 'x' && isInteger(c.substring(2), 16)) 862 s.append((char) Integer.parseInt(c.substring(2), 16)); 863 } else if (c.equals("fnof")) 864 s.append((char) 402); // latin small f with hook = function = florin, U+0192 ISOtech --> 865 else if (c.equals("Alpha")) 866 s.append((char) 913); // greek capital letter alpha, U+0391 867 else if (c.equals("Beta")) 868 s.append((char) 914); // greek capital letter beta, U+0392 869 else if (c.equals("Gamma")) 870 s.append((char) 915); // greek capital letter gamma, U+0393 ISOgrk3 871 else if (c.equals("Delta")) 872 s.append((char) 916); // greek capital letter delta, U+0394 ISOgrk3 873 else if (c.equals("Epsilon")) 874 s.append((char) 917); // greek capital letter epsilon, U+0395 875 else if (c.equals("Zeta")) 876 s.append((char) 918); // greek capital letter zeta, U+0396 877 else if (c.equals("Eta")) 878 s.append((char) 919); // greek capital letter eta, U+0397 879 else if (c.equals("Theta")) 880 s.append((char) 920); // greek capital letter theta, U+0398 ISOgrk3 881 else if (c.equals("Iota")) 882 s.append((char) 921); // greek capital letter iota, U+0399 883 else if (c.equals("Kappa")) 884 s.append((char) 922); // greek capital letter kappa, U+039A 885 else if (c.equals("Lambda")) 886 s.append((char) 923); // greek capital letter lambda, U+039B ISOgrk3 887 else if (c.equals("Mu")) 888 s.append((char) 924); // greek capital letter mu, U+039C 889 else if (c.equals("Nu")) 890 s.append((char) 925); // greek capital letter nu, U+039D 891 else if (c.equals("Xi")) 892 s.append((char) 926); // greek capital letter xi, U+039E ISOgrk3 893 else if (c.equals("Omicron")) 894 s.append((char) 927); // greek capital letter omicron, U+039F 895 else if (c.equals("Pi")) 896 s.append((char) 928); // greek capital letter pi, U+03A0 ISOgrk3 897 else if (c.equals("Rho")) 898 s.append((char) 929); // greek capital letter rho, U+03A1 899 else if (c.equals("Sigma")) 900 s.append((char) 931); // greek capital letter sigma, U+03A3 ISOgrk3 901 else if (c.equals("Tau")) 902 s.append((char) 932); // greek capital letter tau, U+03A4 903 else if (c.equals("Upsilon")) 904 s.append((char) 933); // greek capital letter upsilon, U+03A5 ISOgrk3 905 else if (c.equals("Phi")) 906 s.append((char) 934); // greek capital letter phi, U+03A6 ISOgrk3 907 else if (c.equals("Chi")) 908 s.append((char) 935); // greek capital letter chi, U+03A7 909 else if (c.equals("Psi")) 910 s.append((char) 936); // greek capital letter psi, U+03A8 ISOgrk3 911 else if (c.equals("Omega")) 912 s.append((char) 937); // greek capital letter omega, U+03A9 ISOgrk3 913 else if (c.equals("alpha")) 914 s.append((char) 945); // greek small letter alpha, U+03B1 ISOgrk3 915 else if (c.equals("beta")) 916 s.append((char) 946); // greek small letter beta, U+03B2 ISOgrk3 917 else if (c.equals("gamma")) 918 s.append((char) 947); // greek small letter gamma, U+03B3 ISOgrk3 919 else if (c.equals("delta")) 920 s.append((char) 948); // greek small letter delta, U+03B4 ISOgrk3 921 else if (c.equals("epsilon")) 922 s.append((char) 949); // greek small letter epsilon, U+03B5 ISOgrk3 923 else if (c.equals("zeta")) 924 s.append((char) 950); // greek small letter zeta, U+03B6 ISOgrk3 925 else if (c.equals("eta")) 926 s.append((char) 951); // greek small letter eta, U+03B7 ISOgrk3 927 else if (c.equals("theta")) 928 s.append((char) 952); // greek small letter theta, U+03B8 ISOgrk3 929 else if (c.equals("iota")) 930 s.append((char) 953); // greek small letter iota, U+03B9 ISOgrk3 931 else if (c.equals("kappa")) 932 s.append((char) 954); // greek small letter kappa, U+03BA ISOgrk3 933 else if (c.equals("lambda")) 934 s.append((char) 955); // greek small letter lambda, U+03BB ISOgrk3 935 else if (c.equals("mu")) 936 s.append((char) 956); // greek small letter mu, U+03BC ISOgrk3 937 else if (c.equals("nu")) 938 s.append((char) 957); // greek small letter nu, U+03BD ISOgrk3 939 else if (c.equals("xi")) 940 s.append((char) 958); // greek small letter xi, U+03BE ISOgrk3 941 else if (c.equals("omicron")) 942 s.append((char) 959); // greek small letter omicron, U+03BF NEW 943 else if (c.equals("pi")) 944 s.append((char) 960); // greek small letter pi, U+03C0 ISOgrk3 945 else if (c.equals("rho")) 946 s.append((char) 961); // greek small letter rho, U+03C1 ISOgrk3 947 else if (c.equals("sigmaf")) 948 s.append((char) 962); // greek small letter final sigma, U+03C2 ISOgrk3 949 else if (c.equals("sigma")) 950 s.append((char) 963); // greek small letter sigma, U+03C3 ISOgrk3 951 else if (c.equals("tau")) 952 s.append((char) 964); // greek small letter tau, U+03C4 ISOgrk3 953 else if (c.equals("upsilon")) 954 s.append((char) 965); // greek small letter upsilon, U+03C5 ISOgrk3 955 else if (c.equals("phi")) 956 s.append((char) 966); // greek small letter phi, U+03C6 ISOgrk3 957 else if (c.equals("chi")) 958 s.append((char) 967); // greek small letter chi, U+03C7 ISOgrk3 959 else if (c.equals("psi")) 960 s.append((char) 968); // greek small letter psi, U+03C8 ISOgrk3 961 else if (c.equals("omega")) 962 s.append((char) 969); // greek small letter omega, U+03C9 ISOgrk3 963 else if (c.equals("thetasym")) 964 s.append((char) 977); // greek small letter theta symbol, U+03D1 NEW 965 else if (c.equals("upsih")) 966 s.append((char) 978); // greek upsilon with hook symbol, U+03D2 NEW 967 else if (c.equals("piv")) 968 s.append((char) 982); // greek pi symbol, U+03D6 ISOgrk3 969 else if (c.equals("bull")) 970 s.append((char) 8226); // bullet = black small circle, U+2022 ISOpub 971 else if (c.equals("hellip")) 972 s.append((char) 8230); // horizontal ellipsis = three dot leader, U+2026 ISOpub 973 else if (c.equals("prime")) 974 s.append((char) 8242); // prime = minutes = feet, U+2032 ISOtech 975 else if (c.equals("Prime")) 976 s.append((char) 8243); // double prime = seconds = inches, U+2033 ISOtech 977 else if (c.equals("oline")) 978 s.append((char) 8254); // overline = spacing overscore, U+203E NEW 979 else if (c.equals("frasl")) 980 s.append((char) 8260); // fraction slash, U+2044 NEW 981 else if (c.equals("weierp")) 982 s.append((char) 8472); // script capital P = power set = Weierstrass p, U+2118 ISOamso 983 else if (c.equals("image")) 984 s.append((char) 8465); // blackletter capital I = imaginary part, U+2111 ISOamso 985 else if (c.equals("real")) 986 s.append((char) 8476); // blackletter capital R = real part symbol, U+211C ISOamso 987 else if (c.equals("trade")) 988 s.append((char) 8482); // trade mark sign, U+2122 ISOnum 989 else if (c.equals("alefsym")) 990 s.append((char) 8501); // alef symbol = first transfinite cardinal, U+2135 NEW 991 else if (c.equals("larr")) 992 s.append((char) 8592); // leftwards arrow, U+2190 ISOnum 993 else if (c.equals("uarr")) 994 s.append((char) 8593); // upwards arrow, U+2191 ISOnum 995 else if (c.equals("rarr")) 996 s.append((char) 8594); // rightwards arrow, U+2192 ISOnum 997 else if (c.equals("darr")) 998 s.append((char) 8595); // downwards arrow, U+2193 ISOnum 999 else if (c.equals("harr")) 1000 s.append((char) 8596); // left right arrow, U+2194 ISOamsa 1001 else if (c.equals("crarr")) 1002 s.append((char) 8629); // downwards arrow with corner leftwards = carriage return, U+21B5 NEW 1003 else if (c.equals("lArr")) 1004 s.append((char) 8656); // leftwards double arrow, U+21D0 ISOtech 1005 else if (c.equals("uArr")) 1006 s.append((char) 8657); // upwards double arrow, U+21D1 ISOamsa 1007 else if (c.equals("rArr")) 1008 s.append((char) 8658); // rightwards double arrow, U+21D2 ISOtech 1009 else if (c.equals("dArr")) 1010 s.append((char) 8659); // downwards double arrow, U+21D3 ISOamsa 1011 else if (c.equals("hArr")) 1012 s.append((char) 8660); // left right double arrow, U+21D4 ISOamsa 1013 else if (c.equals("forall")) 1014 s.append((char) 8704); // for all, U+2200 ISOtech 1015 else if (c.equals("part")) 1016 s.append((char) 8706); // partial differential, U+2202 ISOtech 1017 else if (c.equals("exist")) 1018 s.append((char) 8707); // there exists, U+2203 ISOtech 1019 else if (c.equals("empty")) 1020 s.append((char) 8709); // empty set = null set = diameter, U+2205 ISOamso 1021 else if (c.equals("nabla")) 1022 s.append((char) 8711); // nabla = backward difference, U+2207 ISOtech 1023 else if (c.equals("isin")) 1024 s.append((char) 8712); // element of, U+2208 ISOtech 1025 else if (c.equals("notin")) 1026 s.append((char) 8713); // not an element of, U+2209 ISOtech 1027 else if (c.equals("ni")) 1028 s.append((char) 8715); // contains as member, U+220B ISOtech 1029 else if (c.equals("prod")) 1030 s.append((char) 8719); // n-ary product = product sign, U+220F ISOamsb 1031 else if (c.equals("sum")) 1032 s.append((char) 8721); // n-ary sumation, U+2211 ISOamsb 1033 else if (c.equals("minus")) 1034 s.append((char) 8722); // minus sign, U+2212 ISOtech 1035 else if (c.equals("lowast")) 1036 s.append((char) 8727); // asterisk operator, U+2217 ISOtech 1037 else if (c.equals("radic")) 1038 s.append((char) 8730); // square root = radical sign, U+221A ISOtech 1039 else if (c.equals("prop")) 1040 s.append((char) 8733); // proportional to, U+221D ISOtech 1041 else if (c.equals("infin")) 1042 s.append((char) 8734); // infinity, U+221E ISOtech --> 1043 else if (c.equals("ang")) 1044 s.append((char) 8736); // angle, U+2220 ISOamso 1045 else if (c.equals("and")) 1046 s.append((char) 8743); // logical and = wedge, U+2227 ISOtech 1047 else if (c.equals("or")) 1048 s.append((char) 8744); // logical or = vee, U+2228 ISOtech 1049 else if (c.equals("cap")) 1050 s.append((char) 8745); // intersection = cap, U+2229 ISOtech 1051 else if (c.equals("cup")) 1052 s.append((char) 8746); // union = cup, U+222A ISOtech 1053 else if (c.equals("int")) 1054 s.append((char) 8747); // integral, U+222B ISOtech 1055 else if (c.equals("there4")) 1056 s.append((char) 8756); // therefore, U+2234 ISOtech 1057 else if (c.equals("sim")) 1058 s.append((char) 8764); // tilde operator = varies with = similar t U+223C ISOtech 1059 else if (c.equals("cong")) 1060 s.append((char) 8773); // approximately equal to, U+2245 ISOtec 1061 else if (c.equals("asymp")) 1062 s.append((char) 8776); // almost equal to = asymptotic to, U+2248 ISOamsr 1063 else if (c.equals("ne")) 1064 s.append((char) 8800); // not equal to, U+2260 ISOtech 1065 else if (c.equals("equiv")) 1066 s.append((char) 8801); // identical to, U+2261 ISOtech 1067 else if (c.equals("le")) 1068 s.append((char) 8804); // less-than or equal to, U+2264 ISOtech 1069 else if (c.equals("ge")) 1070 s.append((char) 8805); // greater-than or equal to, U+2265 ISOtech 1071 else if (c.equals("sub")) 1072 s.append((char) 8834); // subset of, U+2282 ISOtech 1073 else if (c.equals("sup")) 1074 s.append((char) 8835); // superset of, U+2283 ISOtech 1075 else if (c.equals("nsub")) 1076 s.append((char) 8836); // not a subset of, U+2284 ISOamsn 1077 else if (c.equals("sube")) 1078 s.append((char) 8838); // subset of or equal to, U+2286 ISOtech 1079 else if (c.equals("supe")) 1080 s.append((char) 8839); // superset of or equal to, U+2287 ISOtech 1081 else if (c.equals("oplus")) 1082 s.append((char) 8853); // circled plus = direct sum, U+2295 ISOamsb 1083 else if (c.equals("otimes")) 1084 s.append((char) 8855); // circled times = vector product, U+2297 ISOamsb --> 1085 else if (c.equals("perp")) 1086 s.append((char) 8869); // up tack = orthogonal to = perpendicular, U+22A5 ISOtech 1087 else if (c.equals("sdot")) 1088 s.append((char) 8901); // dot operator, U+22C5 ISOamsb 1089 else if (c.equals("lceil")) 1090 s.append((char) 8968); // left ceiling = apl upstile, U+2308 ISOamsc 1091 else if (c.equals("rceil")) 1092 s.append((char) 8969); // right ceiling, U+2309 ISOamsc 1093 else if (c.equals("lfloor")) 1094 s.append((char) 8970); // left floor = apl downstile, U+230A ISOamsc 1095 else if (c.equals("rfloor")) 1096 s.append((char) 8971); // right floor, U+230B ISOamsc 1097 else if (c.equals("lang")) 1098 s.append((char) 9001); // left-pointing angle bracket = bra, U+2329 ISOtech 1099 else if (c.equals("rang")) 1100 s.append((char) 9002); // right-pointing angle bracket = ket, U+232A ISOtech 1101 else if (c.equals("loz")) 1102 s.append((char) 9674); // lozenge, U+25CA ISOpub 1103 else if (c.equals("spades")) 1104 s.append((char) 9824); // black spade suit, U+2660 ISOpub 1105 else if (c.equals("clubs")) 1106 s.append((char) 9827); // black club suit = shamrock, U+2663 ISOpub 1107 else if (c.equals("hearts")) 1108 s.append((char) 9829); // black heart suit = valentine, U+2665 ISOpub 1109 else if (c.equals("diams")) 1110 s.append((char) 9830); // black diamond suit, U+2666 ISOpub -- 1111 else if (c.equals("ndash")) 1112 s.append((char) 8211); 1113 else if (c.equals("mdash")) 1114 s.append((char) 8212); 1115 else if (c.equals("ldquo")) 1116 s.append((char) 8221); 1117 else if (c.equals("rdquo")) 1118 s.append((char) 201D); 1119 else 1120 throw new FHIRFormatError("unable to parse character reference '" + c + "'' (last text = '" + lastText + "'" + descLoc()); 1121 } 1122 1123 private boolean isInteger(String s, int base) { 1124 try { 1125 Integer.parseInt(s, base); 1126 return true; 1127 } catch (Exception e) { 1128 return false; 1129 } 1130 } 1131 1132 public XhtmlNode parseFragment(String source) throws IOException, FHIRException { 1133 rdr = new StringReader(source); 1134 return parseFragment(); 1135 } 1136 1137 public XhtmlNode parseFragment(InputStream input) throws IOException, FHIRException { 1138 rdr = new InputStreamReader(input); 1139 return parseFragment(); 1140 } 1141 1142 private XhtmlNode parseFragment() throws IOException, FHIRException 1143 { 1144 skipWhiteSpace(); 1145 if (peekChar() != '<') 1146 throw new FHIRException("Unable to Parse HTML - does not start with tag. Found "+peekChar()+descLoc()); 1147 readChar(); 1148 if (peekChar() == '?') { 1149 readToTagEnd(); 1150 skipWhiteSpace(); 1151 if (peekChar() != '<') 1152 throw new FHIRException("Unable to Parse HTML - does not start with tag after processing instruction. Found "+peekChar()+descLoc()); 1153 readChar(); 1154 } 1155 String n = readName().toLowerCase(); 1156 readToTagEnd(); 1157 XhtmlNode result = new XhtmlNode(NodeType.Element); 1158 1159 int colonIndex = n.indexOf(':'); 1160 if (colonIndex != -1) { 1161 n = n.substring(colonIndex + 1); 1162 } 1163 1164 result.setName(n); 1165 unwindPoint = null; 1166 List<XhtmlNode> p = new ArrayList<XhtmlNode>(); 1167 parseElementInner(result, p, null, true); 1168 1169 return result; 1170 } 1171 1172 1173}