001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.juneau.xml; 018 019import static org.apache.juneau.common.utils.StringUtils.*; 020import static org.apache.juneau.common.utils.ThrowableUtils.*; 021 022import java.io.*; 023import java.util.*; 024 025import javax.xml.stream.*; 026 027import org.apache.juneau.*; 028import org.apache.juneau.common.utils.*; 029import org.apache.juneau.internal.*; 030import org.apache.juneau.xml.annotation.*; 031 032/** 033 * XML utility methods. 034 * 035 * <h5 class='section'>See Also:</h5><ul> 036 * <li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/XmlBasics">XML Basics</a> 037 038 * </ul> 039 */ 040public class XmlUtils { 041 042 //----------------------------------------------------------------------------------------------------------------- 043 // XML element names 044 //----------------------------------------------------------------------------------------------------------------- 045 046 /** 047 * Encodes any invalid XML element name characters to <c>_x####_</c> sequences. 048 * 049 * @param w The writer to send the output to. 050 * @param value The object being encoded. 051 * @return The same writer passed in. 052 */ 053 public static Writer encodeElementName(Writer w, Object value) { 054 try { 055 if (value == null) 056 return w.append("_x0000_"); 057 String s = value.toString(); 058 if (needsElementNameEncoding(s)) 059 return encodeElementNameInner(w, s); 060 w.append(s); 061 } catch (IOException e) { 062 throw asRuntimeException(e); 063 } 064 return w; 065 } 066 067 /** 068 * Encodes any invalid XML element name characters to <c>_x####_</c> sequences. 069 * 070 * @param value The object being encoded. 071 * @return The encoded element name string. 072 */ 073 public static String encodeElementName(Object value) { 074 if (value == null) 075 return "_x0000_"; 076 String s = value.toString(); 077 if (s.isEmpty()) 078 return "_xE000_"; 079 080 try { 081 if (needsElementNameEncoding(s)) 082 try (Writer w = new StringBuilderWriter(s.length() * 2)) { 083 return encodeElementNameInner(w, s).toString(); 084 } 085 } catch (IOException e) { 086 throw asRuntimeException(e); // Never happens 087 } 088 089 return s; 090 } 091 092 private static Writer encodeElementNameInner(Writer w, String s) throws IOException { 093 for (int i = 0; i < s.length(); i++) { 094 char c = s.charAt(i); 095 if ((c >= 'A' && c <= 'Z') 096 || (c == '_' && ! isEscapeSequence(s,i)) 097 || (c >= 'a' && c <= 'z') 098 || (i != 0 && ( 099 c == '-' 100 || c == '.' 101 || (c >= '0' && c <= '9') 102 || c == '\u00b7' 103 || (c >= '\u0300' && c <= '\u036f') 104 || (c >= '\u203f' && c <= '\u2040') 105 )) 106 || (c >= '\u00c0' && c <= '\u00d6') 107 || (c >= '\u00d8' && c <= '\u00f6') 108 || (c >= '\u00f8' && c <= '\u02ff') 109 || (c >= '\u0370' && c <= '\u037d') 110 || (c >= '\u037f' && c <= '\u1fff') 111 || (c >= '\u200c' && c <= '\u200d') 112 || (c >= '\u2070' && c <= '\u218f') 113 || (c >= '\u2c00' && c <= '\u2fef') 114 || (c >= '\u3001' && c <= '\ud7ff') 115 || (c >= '\uf900' && c <= '\ufdcf') 116 || (c >= '\ufdf0' && c <= '\ufffd')) { 117 w.append(c); 118 } else { 119 appendPaddedHexChar(w, c); 120 } 121 } 122 return w; 123 } 124 125 private static boolean needsElementNameEncoding(String value) { 126 // Note that this doesn't need to be perfect, just fast. 127 for (int i = 0; i < value.length(); i++) { 128 char c = value.charAt(i); 129 if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && (c >= '0' && c <= '9'))) 130 return true; 131 } 132 return false; 133 } 134 135 //----------------------------------------------------------------------------------------------------------------- 136 // XML element text 137 //----------------------------------------------------------------------------------------------------------------- 138 139 /** 140 * Escapes invalid XML text characters to <c>_x####_</c> sequences. 141 * 142 * @param value The object being encoded. 143 * @return The encoded string. 144 */ 145 public static String escapeText(Object value) { 146 if (value == null) 147 return "_x0000_"; 148 String s = value.toString(); 149 150 try { 151 if (! needsTextEncoding(s)) 152 return s; 153 final int len = s.length(); 154 StringWriter sw = new StringWriter(s.length()*2); 155 for (int i = 0; i < len; i++) { 156 char c = s.charAt(i); 157 if ((i == 0 || i == len-1) && Character.isWhitespace(c)) 158 appendPaddedHexChar(sw, c); 159 else if (c == '_' && isEscapeSequence(s,i)) 160 appendPaddedHexChar(sw, c); 161 else if (isValidXmlCharacter(c)) 162 sw.append(c); 163 else 164 appendPaddedHexChar(sw, c); 165 } 166 return sw.toString(); 167 } catch (IOException e) { 168 throw asRuntimeException(e); // Never happens 169 } 170 } 171 172 /** 173 * Encodes the specified element text and sends the results to the specified writer. 174 * 175 * <p> 176 * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified 177 * writer. 178 * <br>Encodes <js>'&'</js>, <js>'<'</js>, and <js>'>'</js> as XML entities. 179 * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences. 180 * 181 * @param w The writer to send the output to. 182 * @param value The object being encoded. 183 * @param trim Trim the text before serializing it. 184 * @param preserveWhitespace 185 * Specifies whether we're in preserve-whitespace mode. 186 * (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}. 187 * If <jk>true</jk>, leading and trailing whitespace characters will be encoded. 188 * @return The same writer passed in. 189 */ 190 public static Writer encodeText(Writer w, Object value, boolean trim, boolean preserveWhitespace) { 191 192 try { 193 if (value == null) 194 return w.append("_x0000_"); 195 String s = value.toString(); 196 if (s.isEmpty()) 197 return w.append("_xE000_"); 198 if (trim) 199 s = s.trim(); 200 201 if (needsTextEncoding(s)) { 202 final int len = s.length(); 203 for (int i = 0; i < len; i++) { 204 char c = s.charAt(i); 205 if ((i == 0 || i == len-1) && Character.isWhitespace(c) && ! preserveWhitespace) 206 appendPaddedHexChar(w, c); 207 else if (REPLACE_TEXT.contains(c)) 208 w.append(REPLACE_TEXT.get(c)); 209 else if (c == '_' && isEscapeSequence(s,i)) 210 appendPaddedHexChar(w, c); 211 else if (isValidXmlCharacter(c)) 212 w.append(c); 213 else 214 appendPaddedHexChar(w, c); 215 } 216 } else { 217 w.append(s); 218 } 219 } catch (IOException e) { 220 throw asRuntimeException(e); 221 } 222 223 return w; 224 } 225 226 private static boolean needsTextEncoding(String value) { 227 // See if we need to convert the string. 228 // Conversion is somewhat expensive, so make sure we need to do so before hand. 229 final int len = value.length(); 230 for (int i = 0; i < len; i++) { 231 char c = value.charAt(i); 232 if ((i == 0 || i == len-1) && Character.isWhitespace(c)) 233 return true; 234 if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value,i))) 235 return true; 236 } 237 return false; 238 } 239 240 private static AsciiMap REPLACE_TEXT = new AsciiMap() 241 .append('&', "&") 242 .append('<', "<") 243 .append('>', ">") 244 .append((char)0x09, "	") 245 .append((char)0x0A, "
") 246 .append((char)0x0D, "
"); 247 248 249 //----------------------------------------------------------------------------------------------------------------- 250 // XML attribute names 251 //----------------------------------------------------------------------------------------------------------------- 252 253 /** 254 * Serializes and encodes the specified object as valid XML attribute name. 255 * 256 * @param w The writer to send the output to. 257 * @param value The object being serialized. 258 * @return This object. 259 * @throws IOException If a problem occurred. 260 */ 261 public static Writer encodeAttrName(Writer w, Object value) throws IOException { 262 if (value == null) 263 return w.append("_x0000_"); 264 String s = value.toString(); 265 266 if (needsAttrNameEncoding(s)) { 267 for (int i = 0; i < s.length(); i++) { 268 char c = s.charAt(i); 269 if (i == 0) { 270 if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':') 271 w.append(c); 272 else if (c == '_' && ! isEscapeSequence(s,i)) 273 w.append(c); 274 else 275 appendPaddedHexChar(w, c); 276 } else { 277 if ((c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':')) 278 w.append(c); 279 else if (c == '_' && ! isEscapeSequence(s,i)) 280 w.append(c); 281 else 282 appendPaddedHexChar(w, c); 283 } 284 } 285 } else { 286 w.append(s); 287 } 288 289 return w; 290 } 291 292 private static boolean needsAttrNameEncoding(String value) { 293 // Note that this doesn't need to be perfect, just fast. 294 for (int i = 0; i < value.length(); i++) { 295 char c = value.charAt(i); 296 if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))) 297 return true; 298 } 299 return false; 300 } 301 302 //----------------------------------------------------------------------------------------------------------------- 303 // XML attribute values 304 //----------------------------------------------------------------------------------------------------------------- 305 306 /** 307 * Encodes the specified attribute value and sends the results to the specified writer. 308 * 309 * <p> 310 * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified 311 * writer. 312 * <br>Encodes <js>'&'</js>, <js>'<'</js>, <js>'>'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities. 313 * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences. 314 * 315 * @param w The writer to send the output to. 316 * @param value The object being encoded. 317 * @param trim 318 * Trim the text before serializing it. 319 * If <jk>true</jk>, leading and trailing whitespace characters will be encoded. 320 * @return The same writer passed in. 321 */ 322 public static Writer encodeAttrValue(Writer w, Object value, boolean trim) { 323 try { 324 if (value == null) 325 return w.append("_x0000_"); 326 String s = value.toString(); 327 if (s.isEmpty()) 328 return w; 329 if (trim) 330 s = s.trim(); 331 332 if (needsAttrValueEncoding(s)) { 333 final int len = s.length(); 334 for (int i = 0; i < len; i++) { 335 char c = s.charAt(i); 336 if ((i == 0 || i == len-1) && Character.isWhitespace(c)) 337 appendPaddedHexChar(w, c); 338 else if (REPLACE_ATTR_VAL.contains(c)) 339 w.append(REPLACE_ATTR_VAL.get(c)); 340 else if (c == '_' && isEscapeSequence(s,i)) 341 appendPaddedHexChar(w, c); 342 else if (isValidXmlCharacter(c)) 343 w.append(c); 344 else 345 appendPaddedHexChar(w, c); 346 } 347 } else { 348 w.append(s); 349 } 350 } catch (IOException e) { 351 throw asRuntimeException(e); 352 } 353 354 return w; 355 } 356 357 private static boolean needsAttrValueEncoding(String value) { 358 // See if we need to convert the string. 359 // Conversion is somewhat expensive, so make sure we need to do so before hand. 360 final int len = value.length(); 361 for (int i = 0; i < len; i++) { 362 char c = value.charAt(i); 363 if ((i == 0 || i == len-1) && Character.isWhitespace(c)) 364 return true; 365 if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value,i))) 366 return true; 367 } 368 return false; 369 } 370 371 private static AsciiMap REPLACE_ATTR_VAL = new AsciiMap() 372 .append('&', "&") 373 .append('<', "<") 374 .append('>', ">") 375 .append('"', """) 376 .append('\'', "'") 377 .append((char)0x09, "	") 378 .append((char)0x0A, "
") 379 .append((char)0x0D, "
"); 380 381 382 //----------------------------------------------------------------------------------------------------------------- 383 // Decode XML text 384 //----------------------------------------------------------------------------------------------------------------- 385 386 /** 387 * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters. 388 * 389 * @param value The string being decoded. 390 * @param sb The string builder to use as a scratch pad. 391 * @return The decoded string. 392 */ 393 public static String decode(String value, StringBuilder sb) { 394 if (value == null) 395 return null; 396 if (value.isEmpty() || value.indexOf('_') == -1) 397 return value; 398 if (sb == null) 399 sb = new StringBuilder(value.length()); 400 401 for (int i = 0; i < value.length(); i++) { 402 char c = value.charAt(i); 403 if (c == '_' && isEscapeSequence(value,i)) { 404 405 int x = Integer.parseInt(value.substring(i+2, i+6), 16); 406 407 // If we find _x0000_, then that means a null. 408 // If we find _xE000_, then that means an empty string. 409 if (x == 0) 410 return null; 411 else if (x != 0xE000) 412 sb.append((char)x); 413 414 i+=6; 415 } else { 416 sb.append(c); 417 } 418 } 419 return sb.toString(); 420 } 421 422 423 /** 424 * Given a list of Strings and other Objects, combines Strings that are next to each other in the list. 425 * 426 * @param value The list of text nodes to collapse. 427 * @return The same list. 428 */ 429 public static LinkedList<Object> collapseTextNodes(LinkedList<Object> value) { 430 431 String prev = null; 432 for (ListIterator<Object> i = value.listIterator(); i.hasNext();) { 433 Object o = i.next(); 434 if (o instanceof String) { 435 if (prev == null) 436 prev = o.toString(); 437 else { 438 prev += o; 439 i.remove(); 440 i.previous(); 441 i.remove(); 442 i.add(prev); 443 } 444 } else { 445 prev = null; 446 } 447 } 448 return value; 449 } 450 451 //----------------------------------------------------------------------------------------------------------------- 452 // Other methods 453 //----------------------------------------------------------------------------------------------------------------- 454 455 // Returns true if the specified character can safely be used in XML text or an attribute. 456 private static boolean isValidXmlCharacter(char c) { 457 return (c >= 0x20 && c <= 0xD7FF) /*|| c == 0xA || c == 0xD*/ || (c >= 0xE000 && c <= 0xFFFD); 458 } 459 460 // Returns true if the string at the specified position is of the form "_x####_" 461 // where '#' are hexadecimal characters. 462 private static boolean isEscapeSequence(String s, int i) { 463 return s.length() > i+6 464 && s.charAt(i) == '_' 465 && s.charAt(i+1) == 'x' 466 && isHexCharacter(s.charAt(i+2)) 467 && isHexCharacter(s.charAt(i+3)) 468 && isHexCharacter(s.charAt(i+4)) 469 && isHexCharacter(s.charAt(i+5)) 470 && s.charAt(i+6) == '_'; 471 } 472 473 // Returns true if the character is a hexadecimal character 474 private static boolean isHexCharacter(char c) { 475 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'); 476 } 477 478 // Converts an integer to a hexadecimal string padded to 4 places. 479 private static Writer appendPaddedHexChar(Writer out, int num) throws IOException { 480 out.append("_x"); 481 for (char c : toHex4(num)) 482 out.append(c); 483 return out.append('_'); 484 } 485 486 /** 487 * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations. 488 * 489 * <p> 490 * The annotations should be a parent-to-child ordering of annotations found on a class or method. 491 * 492 * @param xmls The list of <ja>@Xml</ja> annotations. 493 * @param schemas The list of <ja>@XmlSchema</ja> annotations. 494 * @return The namespace, or <jk>null</jk> if it couldn't be found. 495 */ 496 public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) { 497 498 for (int i = xmls.size()-1; i >= 0; i--) { 499 Xml xml = xmls.get(i); 500 Namespace ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas); 501 if (ns != null) 502 return ns; 503 } 504 505 for (int i = schemas.size()-1; i >= 0; i--) { 506 XmlSchema schema = schemas.get(i); 507 Namespace ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas); 508 if (ns != null) 509 return ns; 510 } 511 512 return null; 513 } 514 515 private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) { 516 517 // If both prefix and namespace specified, use that Namespace mapping. 518 if (! (prefix.isEmpty() || ns.isEmpty())) 519 return Namespace.of(prefix, ns); 520 521 // If only prefix specified, need to search for namespaceURI. 522 if (! prefix.isEmpty()) { 523 if (xmls != null) 524 for (Xml xml2 : xmls) 525 if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty()) 526 return Namespace.of(prefix, xml2.namespace()); 527 for (XmlSchema schema : schemas) { 528 if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty()) 529 return Namespace.of(prefix, schema.namespace()); 530 for (XmlNs xmlNs : schema.xmlNs()) 531 if (xmlNs.prefix().equals(prefix)) 532 return Namespace.of(prefix, xmlNs.namespaceURI()); 533 } 534 throw new BeanRuntimeException("Found @Xml.prefix annotation with no matching URI. prefix='"+prefix+"'"); 535 } 536 537 // If only namespaceURI specified, need to search for prefix. 538 if (! ns.isEmpty()) { 539 if (xmls != null) 540 for (Xml xml2 : xmls) 541 if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty()) 542 return Namespace.of(xml2.prefix(), ns); 543 for (XmlSchema schema : schemas) { 544 if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty()) 545 return Namespace.of(schema.prefix(), ns); 546 for (XmlNs xmlNs : schema.xmlNs()) 547 if (xmlNs.namespaceURI().equals(ns)) 548 return Namespace.of(xmlNs.prefix(), ns); 549 } 550 } 551 552 return null; 553 } 554 555 /** 556 * Utility method that converts the current event on the XML stream to something human-readable for debug purposes. 557 * 558 * @param r The XML stream reader whose current event is to be converted to a readable string. 559 * @return The event in human-readable form. 560 */ 561 public static String toReadableEvent(XMLStreamReader r) { 562 int t = r.getEventType(); 563 if (t == 1) 564 return "<"+r.getLocalName()+">"; 565 if (t == 2) 566 return "</"+r.getLocalName()+">"; 567 if (t == 3) 568 return "PROCESSING_INSTRUCTION"; 569 if (t == 4) 570 return "CHARACTERS=[" + r.getText() + "]"; 571 if (t == 5) 572 return "COMMENTS=[" + r.getText() + "]"; 573 if (t == 6) 574 return "SPACE=[" + r.getText() + "]"; 575 if (t == 7) 576 return "START_DOCUMENT"; 577 if (t == 8) 578 return "END_DOCUMENT"; 579 if (t == 9) 580 return "ENTITY_REFERENCE"; 581 if (t == 10) 582 return "ATTRIBUTE"; 583 if (t == 11) 584 return "DTD"; 585 if (t == 12) 586 return "CDATA=["+r.getText()+"]"; 587 if (t == 13) 588 return "NAMESPACE"; 589 if (t == 14) 590 return "NOTATION_DECLARATION"; 591 if (t == 15) 592 return "ENTITY_DECLARATION"; 593 return "UNKNOWN"; 594 } 595}