// ***************************************************************************************************************************
// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
// * with the License.  You may obtain a copy of the License at                                                              *
// *                                                                                                                         *
// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
// *                                                                                                                         *
// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
// * specific language governing permissions and limitations under the License.                                              *
// ***************************************************************************************************************************
package org.apache.juneau.xml;

import static org.apache.juneau.internal.StringUtils.*;

import java.io.*;
import java.util.*;

import javax.xml.stream.*;

import org.apache.juneau.*;
import org.apache.juneau.internal.*;
import org.apache.juneau.xml.annotation.*;

/**
 * XML utility methods.
 *
 * <p>
 * The encode methods in this class replace characters that are not legal at a given position with
 * <code>_x####_</code> sequences, where <code>####</code> is the 4-digit upper-case hexadecimal value of the character.
 * Two sequences carry special meaning: <code>_x0000_</code> represents a <jk>null</jk> value and <code>_xE000_</code>
 * represents an empty string.
 * {@link #decode(String, StringBuilder)} reverses the encoding.
 */
public final class XmlUtils {

    //--------------------------------------------------------------------------------
    // XML element names
    //--------------------------------------------------------------------------------

    /**
     * Encodes any invalid XML element name characters to <code>_x####_</code> sequences.
     *
     * <p>
     * A <jk>null</jk> object is written as <code>_x0000_</code> and an empty name as <code>_xE000_</code>, matching
     * the behavior of {@link #encodeElementName(Object)}.
     *
     * @param w The writer to send the output to.
     * @param o The object being encoded.
     * @return The same writer passed in.
     * @throws IOException Thrown by the writer.
     */
    public static final Writer encodeElementName(Writer w, Object o) throws IOException {

        if (o == null)
            return w.append("_x0000_");

        String s = o.toString();

        // An empty element name is not valid XML.  Emit the same marker the String overload uses so that
        // both overloads produce identical output and the value survives a decode round trip.
        if (s.isEmpty())
            return w.append("_xE000_");

        if (needsElementNameEncoding(s))
            return encodeElementNameInner(w, s);

        w.append(s);
        return w;
    }

    /**
     * Encodes any invalid XML element name characters to <code>_x####_</code> sequences.
     *
     * @param o The object being encoded.
     * @return
     *     The encoded element name string.
     *     <br><code>_x0000_</code> if the object is <jk>null</jk>, <code>_xE000_</code> if its string form is empty.
     */
    public static final String encodeElementName(Object o) {
        if (o == null)
            return "_x0000_";

        String s = o.toString();
        if (s.isEmpty())
            return "_xE000_";

        if (! needsElementNameEncoding(s))
            return s;

        try (Writer w = new StringBuilderWriter(s.length() * 2)) {
            return encodeElementNameInner(w, s).toString();
        } catch (IOException e) {
            throw new RuntimeException(e); // Never happens
        }
    }

    // Writes the element name to the writer, replacing every character that is not allowed at its position
    // (and every '_' that starts something that looks like an escape sequence) with a _x####_ sequence.
    private static final Writer encodeElementNameInner(Writer w, String s) throws IOException {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if ((c >= 'A' && c <= 'Z')
                    || (c == '_' && ! isEscapeSequence(s,i))
                    || (c >= 'a' && c <= 'z')
                    // The following characters are only accepted after the first position.
                    || (i != 0 && (
                        c == '-'
                        || c == '.'
                        || (c >= '0' && c <= '9')
                        || c == '\u00b7'
                        || (c >= '\u0300' && c <= '\u036f')
                        || (c >= '\u203f' && c <= '\u2040')
                    ))
                    || (c >= '\u00c0' && c <= '\u00d6')
                    || (c >= '\u00d8' && c <= '\u00f6')
                    || (c >= '\u00f8' && c <= '\u02ff')
                    || (c >= '\u0370' && c <= '\u037d')
                    || (c >= '\u037f' && c <= '\u1fff')
                    || (c >= '\u200c' && c <= '\u200d')
                    || (c >= '\u2070' && c <= '\u218f')
                    || (c >= '\u2c00' && c <= '\u2fef')
                    || (c >= '\u3001' && c <= '\ud7ff')
                    || (c >= '\uf900' && c <= '\ufdcf')
                    || (c >= '\ufdf0' && c <= '\ufffd')) {
                w.append(c);
            } else {
                appendPaddedHexChar(w, c);
            }
        }
        return w;
    }

    // Quick pre-check: returns true unless the name consists solely of ASCII letters and digits
    // and does not start with a digit.
    private static final boolean needsElementNameEncoding(String s) {
        // Note that this doesn't need to be perfect, just fast.
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))
                return true;
            if (i == 0 && (c >= '0' && c <= '9'))
                return true;
        }
        return false;
    }

    //--------------------------------------------------------------------------------
    // XML element text
    //--------------------------------------------------------------------------------

    /**
     * Escapes invalid XML text characters to <code>_x####_</code> sequences.
     *
     * <p>
     * Leading and trailing whitespace characters and underscores that start an escape-like sequence are escaped as
     * well.
     * Unlike {@link #encodeText(Writer, Object, boolean, boolean)}, no XML entities are produced.
     *
     * @param o The object being encoded.
     * @return The encoded string, or <code>_x0000_</code> if the object is <jk>null</jk>.
     */
    public static final String escapeText(Object o) {

        if (o == null)
            return "_x0000_";

        String s = o.toString();

        try {
            if (! needsTextEncoding(s))
                return s;
            final int len = s.length();
            StringWriter sw = new StringWriter(s.length()*2);
            for (int i = 0; i < len; i++) {
                char c = s.charAt(i);
                if ((i == 0 || i == len-1) && Character.isWhitespace(c))
                    appendPaddedHexChar(sw, c);
                else if (c == '_' && isEscapeSequence(s,i))
                    appendPaddedHexChar(sw, c);
                else if (isValidXmlCharacter(c))
                    sw.append(c);
                else
                    appendPaddedHexChar(sw, c);
            }
            return sw.toString();
        } catch (IOException e) {
            throw new RuntimeException(e); // Never happens
        }
    }

    /**
     * Encodes the specified element text and sends the results to the specified writer.
     *
     * <p>
     * Encodes any invalid XML text characters to <code>_x####_</code> sequences and sends the response to the specified
     * writer.
     * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, and <js>'&gt;'</js> as XML entities.
     * <br>Encodes tab, line-feed and carriage-return characters as numeric character references.
     * <br>Encodes invalid XML text characters to <code>_x####_</code> sequences.
     *
     * @param w The writer to send the output to.
     * @param o The object being encoded.
     * @param trim Trim the text before serializing it.
     * @param preserveWhitespace
     *     Specifies whether we're in preserve-whitespace mode
     *     (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}).
     *     If <jk>false</jk>, leading and trailing whitespace characters will be encoded as <code>_x####_</code>
     *     sequences.
     * @return The same writer passed in.
     * @throws IOException Thrown from the writer.
     */
    public static final Writer encodeText(Writer w, Object o, boolean trim, boolean preserveWhitespace) throws IOException {

        if (o == null)
            return w.append("_x0000_");

        String s = o.toString();
        if (s.isEmpty())
            return w.append("_xE000_");
        if (trim)
            s = s.trim();

        if (needsTextEncoding(s)) {
            final int len = s.length();
            for (int i = 0; i < len; i++) {
                char c = s.charAt(i);
                if ((i == 0 || i == len-1) && Character.isWhitespace(c) && ! preserveWhitespace)
                    appendPaddedHexChar(w, c);
                else if (REPLACE_TEXT.contains(c))
                    w.append(REPLACE_TEXT.get(c));
                else if (c == '_' && isEscapeSequence(s,i))
                    appendPaddedHexChar(w, c);
                else if (isValidXmlCharacter(c))
                    w.append(c);
                else
                    appendPaddedHexChar(w, c);
            }
        } else {
            w.append(s);
        }

        return w;
    }

    private static final boolean needsTextEncoding(String s) {
        // See if we need to convert the string.
        // Conversion is somewhat expensive, so make sure we need to do so before hand.
        final int len = s.length();
        for (int i = 0; i < len; i++) {
            char c = s.charAt(i);
            if ((i == 0 || i == len-1) && Character.isWhitespace(c))
                return true;
            if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(s,i)))
                return true;
        }
        return false;
    }

    // Characters in element text that are replaced with XML entities / character references.
    private static final AsciiMap REPLACE_TEXT = new AsciiMap()
        .append('&', "&amp;")
        .append('<', "&lt;")
        .append('>', "&gt;")
        .append((char)0x09, "&#x0009;")
        .append((char)0x0A, "&#x000a;")
        .append((char)0x0D, "&#x000d;");


    //--------------------------------------------------------------------------------
    // XML attribute names
    //--------------------------------------------------------------------------------

    /**
     * Serializes and encodes the specified object as valid XML attribute name.
     *
     * <p>
     * ASCII letters, <js>':'</js>, and underscores that do not start an escape-like sequence are written as-is.
     * ASCII digits are written as-is everywhere except the first position.
     * Everything else is written as a <code>_x####_</code> sequence.
     *
     * @param w The writer to send the output to.
     * @param o The object being serialized.
     * @return The same writer passed in.
     * @throws IOException If a problem occurred.
     */
    public static final Writer encodeAttrName(Writer w, Object o) throws IOException {

        if (o == null)
            return w.append("_x0000_");

        String s = o.toString();

        if (needsAttrNameEncoding(s)) {
            for (int i = 0; i < s.length(); i++) {
                char c = s.charAt(i);
                boolean isPlain =
                    (c >= 'a' && c <= 'z')
                    || (c >= 'A' && c <= 'Z')
                    || c == ':'
                    || (i != 0 && c >= '0' && c <= '9')  // Digits are not allowed in the first position.
                    || (c == '_' && ! isEscapeSequence(s,i));
                if (isPlain)
                    w.append(c);
                else
                    appendPaddedHexChar(w, c);
            }
        } else {
            w.append(s);
        }

        return w;
    }

    // Quick pre-check: returns true unless the name consists solely of ASCII letters and digits
    // and starts with a letter.
    private static final boolean needsAttrNameEncoding(String s) {
        // Note that this doesn't need to be perfect, just fast.
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))
                return true;
            if (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))
                return true;
        }
        return false;
    }

    //--------------------------------------------------------------------------------
    // XML attribute values
    //--------------------------------------------------------------------------------

    /**
     * Encodes the specified attribute value and sends the results to the specified writer.
     *
     * <p>
     * Encodes any invalid XML text characters to <code>_x####_</code> sequences and sends the response to the specified
     * writer.
     * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, <js>'&gt;'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities.
     * <br>Encodes tab, line-feed and carriage-return characters as numeric character references.
     * <br>Encodes leading and trailing whitespace characters to <code>_x####_</code> sequences.
     * <br>Encodes invalid XML text characters to <code>_x####_</code> sequences.
     *
     * <p>
     * Nothing is written for an empty string.
     *
     * @param w The writer to send the output to.
     * @param o The object being encoded.
     * @param trim Trim the text before serializing it.
     * @return The same writer passed in.
     * @throws IOException Thrown from the writer.
     */
    public static final Writer encodeAttrValue(Writer w, Object o, boolean trim) throws IOException {
        if (o == null)
            return w.append("_x0000_");

        String s = o.toString();
        if (s.isEmpty())
            return w;
        if (trim)
            s = s.trim();

        if (needsAttrValueEncoding(s)) {
            final int len = s.length();
            for (int i = 0; i < len; i++) {
                char c = s.charAt(i);
                if ((i == 0 || i == len-1) && Character.isWhitespace(c))
                    appendPaddedHexChar(w, c);
                else if (REPLACE_ATTR_VAL.contains(c))
                    w.append(REPLACE_ATTR_VAL.get(c));
                else if (c == '_' && isEscapeSequence(s,i))
                    appendPaddedHexChar(w, c);
                else if (isValidXmlCharacter(c))
                    w.append(c);
                else
                    appendPaddedHexChar(w, c);
            }
        } else {
            w.append(s);
        }

        return w;
    }

    private static final boolean needsAttrValueEncoding(String s) {
        // See if we need to convert the string.
        // Conversion is somewhat expensive, so make sure we need to do so before hand.
        final int len = s.length();
        for (int i = 0; i < len; i++) {
            char c = s.charAt(i);
            if ((i == 0 || i == len-1) && Character.isWhitespace(c))
                return true;
            if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(s,i)))
                return true;
        }
        return false;
    }

    // Characters in attribute values that are replaced with XML entities / character references.
    private static final AsciiMap REPLACE_ATTR_VAL = new AsciiMap()
        .append('&', "&amp;")
        .append('<', "&lt;")
        .append('>', "&gt;")
        .append('"', "&quot;")
        .append('\'', "&apos;")
        .append((char)0x09, "&#x0009;")
        .append((char)0x0A, "&#x000a;")
        .append((char)0x0D, "&#x000d;");


    //--------------------------------------------------------------------------------
    // Decode XML text
    //--------------------------------------------------------------------------------

    /**
     * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters.
     *
     * @param s The string being decoded.
     * @param sb
     *     The string builder to use as a scratch pad.
     *     <br>Can be <jk>null</jk>, in which case a new one is created.
     *     <br>Any existing content is discarded.
     * @return
     *     The decoded string.
     *     <br><jk>null</jk> if the input is <jk>null</jk> or contains a <code>_x0000_</code> sequence.
     */
    public static final String decode(String s, StringBuilder sb) {
        if (s == null)
            return null;
        if (s.length() == 0)
            return s;
        if (s.indexOf('_') == -1)
            return s;

        if (sb == null)
            sb = new StringBuilder(s.length());
        else
            sb.setLength(0);  // Scratch pad may hold leftovers from a previous use.

        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '_' && isEscapeSequence(s,i)) {

                int x = Integer.parseInt(s.substring(i+2, i+6), 16);

                // If we find _x0000_, then that means a null.
                // If we find _xE000_, then that means an empty string.
                if (x == 0)
                    return null;
                else if (x != 0xE000)
                    sb.append((char)x);

                // Skip the remaining 6 characters of the 7-character sequence (the loop increment skips the 7th).
                i+=6;
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }


    /**
     * Given a list of Strings and other Objects, combines Strings that are next to each other in the list.
     *
     * <p>
     * The list is modified in place.
     *
     * @param l The list of text nodes to collapse.
     * @return The same list.
     */
    public static LinkedList<Object> collapseTextNodes(LinkedList<Object> l) {

        String prev = null;
        for (ListIterator<Object> i = l.listIterator(); i.hasNext();) {
            Object o = i.next();
            if (o instanceof String) {
                if (prev == null)
                    prev = o.toString();
                else {
                    // Replace the previous string and the current one with their concatenation.
                    prev += o;
                    i.remove();
                    i.previous();
                    i.remove();
                    i.add(prev);
                }
            } else {
                prev = null;
            }
        }
        return l;
    }

    //--------------------------------------------------------------------------------
    // Other methods
    //--------------------------------------------------------------------------------

    // Returns true if the specified character can safely be used in XML text or an attribute.
    private static final boolean isValidXmlCharacter(char c) {
        return (c >= 0x20 && c <= 0xD7FF) /*|| c == 0xA || c == 0xD*/ || (c >= 0xE000 && c <= 0xFFFD);
    }

    // Returns true if the string at the specified position is of the form "_x####_"
    // where '#' are hexadecimal characters.
    private static final boolean isEscapeSequence(String s, int i) {
        return s.length() > i+6
            && s.charAt(i) == '_'
            && s.charAt(i+1) == 'x'
            && isHexCharacter(s.charAt(i+2))
            && isHexCharacter(s.charAt(i+3))
            && isHexCharacter(s.charAt(i+4))
            && isHexCharacter(s.charAt(i+5))
            && s.charAt(i+6) == '_';
    }

    // Returns true if the character is a hexadecimal character (digits and upper-case A-F only).
    private static final boolean isHexCharacter(char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
    }

    // Writes the specified character code to the writer as a "_x####_" sequence.
    private static final Writer appendPaddedHexChar(Writer out, int num) throws IOException {
        out.append("_x");
        for (char c : toHex(num))
            out.append(c);
        return out.append('_');
    }

    /**
     * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations.
     *
     * <p>
     * The annotations should be a child-to-parent ordering of annotations found on a class or method.
     *
     * @param xmls The list of <ja>@Xml</ja> annotations.
     * @param schemas The list of <ja>@XmlSchema</ja> annotations.
     * @return The namespace, or <jk>null</jk> if it couldn't be found.
     */
    public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) {

        for (Xml xml : xmls) {
            Namespace ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas);
            if (ns != null)
                return ns;
        }

        for (XmlSchema schema : schemas) {
            Namespace ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas);
            if (ns != null)
                return ns;
        }

        return null;
    }

    // Resolves a single prefix/namespaceURI pair (either of which may be empty) into a Namespace, using the other
    // annotations to fill in whichever half is missing.  The xmls list may be null; the schemas list may not.
    private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) {

        // If both prefix and namespace specified, use that Namespace mapping.
        if (! (prefix.isEmpty() || ns.isEmpty()))
            return Namespace.create(prefix, ns);

        // If only prefix specified, need to search for namespaceURI.
        if (! prefix.isEmpty()) {
            if (xmls != null)
                for (Xml xml2 : xmls)
                    if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty())
                        return Namespace.create(prefix, xml2.namespace());
            for (XmlSchema schema : schemas) {
                if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty())
                    return Namespace.create(prefix, schema.namespace());
                for (XmlNs xmlNs : schema.xmlNs())
                    if (xmlNs.prefix().equals(prefix))
                        return Namespace.create(prefix, xmlNs.namespaceURI());
            }
            throw new BeanRuntimeException("Found @Xml.prefix annotation with no matching URI.  prefix='"+prefix+"'");
        }

        // If only namespaceURI specified, need to search for prefix.
        if (! ns.isEmpty()) {
            if (xmls != null)
                for (Xml xml2 : xmls)
                    if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty())
                        return Namespace.create(xml2.prefix(), ns);
            for (XmlSchema schema : schemas) {
                if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty())
                    return Namespace.create(schema.prefix(), ns);
                for (XmlNs xmlNs : schema.xmlNs())
                    if (xmlNs.namespaceURI().equals(ns))
                        return Namespace.create(xmlNs.prefix(), ns);
            }
        }

        return null;
    }

    /**
     * Utility method that converts the current event on the XML stream to something human-readable for debug purposes.
     *
     * @param r The XML stream reader whose current event is to be converted to a readable string.
     * @return The event in human-readable form, or <js>"UNKNOWN"</js> if the event type is not recognized.
     */
    public static final String toReadableEvent(XMLStreamReader r) {
        switch (r.getEventType()) {
            case XMLStreamConstants.START_ELEMENT:
                return "<"+r.getLocalName()+">";
            case XMLStreamConstants.END_ELEMENT:
                return "</"+r.getLocalName()+">";
            case XMLStreamConstants.PROCESSING_INSTRUCTION:
                return "PROCESSING_INSTRUCTION";
            case XMLStreamConstants.CHARACTERS:
                return "CHARACTERS=[" + r.getText() + "]";
            case XMLStreamConstants.COMMENT:
                return "COMMENTS=[" + r.getText() + "]";
            case XMLStreamConstants.SPACE:
                return "SPACE=[" + r.getText() + "]";
            case XMLStreamConstants.START_DOCUMENT:
                return "START_DOCUMENT";
            case XMLStreamConstants.END_DOCUMENT:
                return "END_DOCUMENT";
            case XMLStreamConstants.ENTITY_REFERENCE:
                return "ENTITY_REFERENCE";
            case XMLStreamConstants.ATTRIBUTE:
                return "ATTRIBUTE";
            case XMLStreamConstants.DTD:
                return "DTD";
            case XMLStreamConstants.CDATA:
                return "CDATA=["+r.getText()+"]";
            case XMLStreamConstants.NAMESPACE:
                return "NAMESPACE";
            case XMLStreamConstants.NOTATION_DECLARATION:
                return "NOTATION_DECLARATION";
            case XMLStreamConstants.ENTITY_DECLARATION:
                return "ENTITY_DECLARATION";
            default:
                return "UNKNOWN";
        }
    }
}