001// ***************************************************************************************************************************
002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
003// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
005// * with the License.  You may obtain a copy of the License at                                                              *
006// *                                                                                                                         *
007// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
008// *                                                                                                                         *
009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
011// * specific language governing permissions and limitations under the License.                                              *
012// ***************************************************************************************************************************
013package org.apache.juneau.xml;
014
015import static org.apache.juneau.common.internal.StringUtils.*;
016import static org.apache.juneau.common.internal.ThrowableUtils.*;
017
018import java.io.*;
019import java.util.*;
020
021import javax.xml.stream.*;
022
023import org.apache.juneau.*;
024import org.apache.juneau.common.internal.*;
025import org.apache.juneau.internal.*;
026import org.apache.juneau.xml.annotation.*;
027
028/**
029 * XML utility methods.
030 *
031 * <h5 class='section'>See Also:</h5><ul>
032 *    <li class='link'><a class="doclink" href="../../../../index.html#jm.XmlDetails">XML Details</a>
033
034 * </ul>
035 */
036public final class XmlUtils {
037
038   //-----------------------------------------------------------------------------------------------------------------
039   // XML element names
040   //-----------------------------------------------------------------------------------------------------------------
041
042   /**
043    * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
044    *
045    * @param w The writer to send the output to.
046    * @param value The object being encoded.
047    * @return The same writer passed in.
048    */
049   public static Writer encodeElementName(Writer w, Object value) {
050      try {
051         if (value == null)
052            return w.append("_x0000_");
053         String s = value.toString();
054         if (needsElementNameEncoding(s))
055            return encodeElementNameInner(w, s);
056         w.append(s);
057      } catch (IOException e) {
058         throw asRuntimeException(e);
059      }
060      return w;
061   }
062
063   /**
064    * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
065    *
066    * @param value The object being encoded.
067    * @return The encoded element name string.
068    */
069   public static String encodeElementName(Object value) {
070      if (value == null)
071         return "_x0000_";
072      String s = value.toString();
073      if (s.isEmpty())
074         return "_xE000_";
075
076      try {
077         if (needsElementNameEncoding(s))
078            try (Writer w = new StringBuilderWriter(s.length() * 2)) {
079               return encodeElementNameInner(w, s).toString();
080            }
081      } catch (IOException e) {
082         throw asRuntimeException(e); // Never happens
083      }
084
085      return s;
086   }
087
088   private static Writer encodeElementNameInner(Writer w, String s) throws IOException {
089      for (int i = 0; i < s.length(); i++) {
090         char c = s.charAt(i);
091         if ((c >= 'A' && c <= 'Z')
092               || (c == '_' && ! isEscapeSequence(s,i))
093               || (c >= 'a' && c <= 'z')
094               || (i != 0 && (
095                     c == '-'
096                     || c == '.'
097                     || (c >= '0' && c <= '9')
098                     || c == '\u00b7'
099                     || (c >= '\u0300' && c <= '\u036f')
100                     || (c >= '\u203f' && c <= '\u2040')
101                  ))
102               || (c >= '\u00c0' && c <= '\u00d6')
103               || (c >= '\u00d8' && c <= '\u00f6')
104               || (c >= '\u00f8' && c <= '\u02ff')
105               || (c >= '\u0370' && c <= '\u037d')
106               || (c >= '\u037f' && c <= '\u1fff')
107               || (c >= '\u200c' && c <= '\u200d')
108               || (c >= '\u2070' && c <= '\u218f')
109               || (c >= '\u2c00' && c <= '\u2fef')
110               || (c >= '\u3001' && c <= '\ud7ff')
111               || (c >= '\uf900' && c <= '\ufdcf')
112               || (c >= '\ufdf0' && c <= '\ufffd')) {
113            w.append(c);
114         }  else {
115            appendPaddedHexChar(w, c);
116         }
117      }
118      return w;
119   }
120
121   private static boolean needsElementNameEncoding(String value) {
122      // Note that this doesn't need to be perfect, just fast.
123      for (int i = 0; i < value.length(); i++) {
124         char c = value.charAt(i);
125         if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && (c >= '0' && c <= '9')))
126            return true;
127      }
128      return false;
129   }
130
131   //-----------------------------------------------------------------------------------------------------------------
132   // XML element text
133   //-----------------------------------------------------------------------------------------------------------------
134
135   /**
136    * Escapes invalid XML text characters to <c>_x####_</c> sequences.
137    *
138    * @param value The object being encoded.
139    * @return The encoded string.
140    */
141   public static String escapeText(Object value) {
142      if (value == null)
143         return "_x0000_";
144      String s = value.toString();
145
146      try {
147         if (! needsTextEncoding(s))
148            return s;
149         final int len = s.length();
150         StringWriter sw = new StringWriter(s.length()*2);
151         for (int i = 0; i < len; i++) {
152            char c = s.charAt(i);
153            if ((i == 0 || i == len-1) && Character.isWhitespace(c))
154               appendPaddedHexChar(sw, c);
155            else if (c == '_' && isEscapeSequence(s,i))
156               appendPaddedHexChar(sw, c);
157            else if (isValidXmlCharacter(c))
158               sw.append(c);
159            else
160               appendPaddedHexChar(sw, c);
161         }
162         return sw.toString();
163      } catch (IOException e) {
164         throw asRuntimeException(e); // Never happens
165      }
166   }
167
168   /**
169    * Encodes the specified element text and sends the results to the specified writer.
170    *
171    * <p>
172    * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified
173    * writer.
174    * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, and <js>'&gt;'</js> as XML entities.
175    * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences.
176    *
177    * @param w The writer to send the output to.
178    * @param value The object being encoded.
179    * @param trim Trim the text before serializing it.
180    * @param preserveWhitespace
181    *    Specifies whether we're in preserve-whitespace mode.
182    *    (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}.
183    *    If <jk>true</jk>, leading and trailing whitespace characters will be encoded.
184    * @return The same writer passed in.
185    */
186   public static Writer encodeText(Writer w, Object value, boolean trim, boolean preserveWhitespace) {
187
188      try {
189         if (value == null)
190            return w.append("_x0000_");
191         String s = value.toString();
192         if (s.isEmpty())
193            return w.append("_xE000_");
194         if (trim)
195            s = s.trim();
196
197         if (needsTextEncoding(s)) {
198            final int len = s.length();
199            for (int i = 0; i < len; i++) {
200               char c = s.charAt(i);
201               if ((i == 0 || i == len-1) && Character.isWhitespace(c) && ! preserveWhitespace)
202                  appendPaddedHexChar(w, c);
203               else if (REPLACE_TEXT.contains(c))
204                  w.append(REPLACE_TEXT.get(c));
205               else if (c == '_' && isEscapeSequence(s,i))
206                  appendPaddedHexChar(w, c);
207               else if (isValidXmlCharacter(c))
208                  w.append(c);
209               else
210                  appendPaddedHexChar(w, c);
211            }
212         } else {
213            w.append(s);
214         }
215      } catch (IOException e) {
216         throw asRuntimeException(e);
217      }
218
219      return w;
220   }
221
222   private static boolean needsTextEncoding(String value) {
223      // See if we need to convert the string.
224      // Conversion is somewhat expensive, so make sure we need to do so before hand.
225      final int len = value.length();
226      for (int i = 0; i < len; i++) {
227         char c = value.charAt(i);
228         if ((i == 0 || i == len-1) && Character.isWhitespace(c))
229            return true;
230         if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value,i)))
231            return true;
232      }
233      return false;
234   }
235
236   private static AsciiMap REPLACE_TEXT = new AsciiMap()
237      .append('&', "&amp;")
238      .append('<', "&lt;")
239      .append('>', "&gt;")
240      .append((char)0x09, "&#x0009;")
241      .append((char)0x0A, "&#x000a;")
242      .append((char)0x0D, "&#x000d;");
243
244
245   //-----------------------------------------------------------------------------------------------------------------
246   // XML attribute names
247   //-----------------------------------------------------------------------------------------------------------------
248
249   /**
250    * Serializes and encodes the specified object as valid XML attribute name.
251    *
252    * @param w The writer to send the output to.
253    * @param value The object being serialized.
254    * @return This object.
255    * @throws IOException If a problem occurred.
256    */
257   public static Writer encodeAttrName(Writer w, Object value) throws IOException {
258      if (value == null)
259         return w.append("_x0000_");
260      String s = value.toString();
261
262      if (needsAttrNameEncoding(s)) {
263         for (int i = 0; i < s.length(); i++) {
264            char c = s.charAt(i);
265            if (i == 0) {
266               if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':')
267                  w.append(c);
268               else if (c == '_' && ! isEscapeSequence(s,i))
269                  w.append(c);
270               else
271                  appendPaddedHexChar(w, c);
272            } else {
273               if ((c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':'))
274                  w.append(c);
275               else if (c == '_' && ! isEscapeSequence(s,i))
276                  w.append(c);
277               else
278                  appendPaddedHexChar(w, c);
279            }
280         }
281      } else {
282         w.append(s);
283      }
284
285      return w;
286   }
287
288   private static boolean needsAttrNameEncoding(String value) {
289      // Note that this doesn't need to be perfect, just fast.
290      for (int i = 0; i < value.length(); i++) {
291         char c = value.charAt(i);
292         if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')))
293            return true;
294      }
295      return false;
296   }
297
298   //-----------------------------------------------------------------------------------------------------------------
299   // XML attribute values
300   //-----------------------------------------------------------------------------------------------------------------
301
302   /**
303    * Encodes the specified attribute value and sends the results to the specified writer.
304    *
305    * <p>
306    * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified
307    * writer.
308    * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, <js>'&gt;'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities.
309    * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences.
310    *
311    * @param w The writer to send the output to.
312    * @param value The object being encoded.
313    * @param trim
314    *    Trim the text before serializing it.
315    *    If <jk>true</jk>, leading and trailing whitespace characters will be encoded.
316    * @return The same writer passed in.
317    */
318   public static Writer encodeAttrValue(Writer w, Object value, boolean trim) {
319      try {
320         if (value == null)
321            return w.append("_x0000_");
322         String s = value.toString();
323         if (s.isEmpty())
324            return w;
325         if (trim)
326            s = s.trim();
327
328         if (needsAttrValueEncoding(s)) {
329            final int len = s.length();
330            for (int i = 0; i < len; i++) {
331               char c = s.charAt(i);
332               if ((i == 0 || i == len-1) && Character.isWhitespace(c))
333                  appendPaddedHexChar(w, c);
334               else if (REPLACE_ATTR_VAL.contains(c))
335                  w.append(REPLACE_ATTR_VAL.get(c));
336               else if (c == '_' && isEscapeSequence(s,i))
337                  appendPaddedHexChar(w, c);
338               else if (isValidXmlCharacter(c))
339                  w.append(c);
340               else
341                  appendPaddedHexChar(w, c);
342            }
343         } else {
344            w.append(s);
345         }
346      } catch (IOException e) {
347         throw asRuntimeException(e);
348      }
349
350      return w;
351   }
352
353   private static boolean needsAttrValueEncoding(String value) {
354      // See if we need to convert the string.
355      // Conversion is somewhat expensive, so make sure we need to do so before hand.
356      final int len = value.length();
357      for (int i = 0; i < len; i++) {
358         char c = value.charAt(i);
359         if ((i == 0 || i == len-1) && Character.isWhitespace(c))
360            return true;
361         if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value,i)))
362            return true;
363      }
364      return false;
365   }
366
367   private static AsciiMap REPLACE_ATTR_VAL = new AsciiMap()
368      .append('&', "&amp;")
369      .append('<', "&lt;")
370      .append('>', "&gt;")
371      .append('"', "&quot;")
372      .append('\'', "&apos;")
373      .append((char)0x09, "&#x0009;")
374      .append((char)0x0A, "&#x000a;")
375      .append((char)0x0D, "&#x000d;");
376
377
378   //-----------------------------------------------------------------------------------------------------------------
379   // Decode XML text
380   //-----------------------------------------------------------------------------------------------------------------
381
382   /**
383    * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters.
384    *
385    * @param value The string being decoded.
386    * @param sb The string builder to use as a scratch pad.
387    * @return The decoded string.
388    */
389   public static String decode(String value, StringBuilder sb) {
390      if (value == null)
391         return null;
392      if (value.isEmpty() || value.indexOf('_') == -1)
393         return value;
394      if (sb == null)
395         sb = new StringBuilder(value.length());
396
397      for (int i = 0; i < value.length(); i++) {
398         char c = value.charAt(i);
399         if (c == '_' && isEscapeSequence(value,i)) {
400
401            int x = Integer.parseInt(value.substring(i+2, i+6), 16);
402
403            // If we find _x0000_, then that means a null.
404            // If we find _xE000_, then that means an empty string.
405            if (x == 0)
406               return null;
407            else if (x != 0xE000)
408               sb.append((char)x);
409
410            i+=6;
411         } else {
412            sb.append(c);
413         }
414      }
415      return sb.toString();
416   }
417
418
419   /**
420    * Given a list of Strings and other Objects, combines Strings that are next to each other in the list.
421    *
422    * @param value The list of text nodes to collapse.
423    * @return The same list.
424    */
425   public static LinkedList<Object> collapseTextNodes(LinkedList<Object> value) {
426
427      String prev = null;
428      for (ListIterator<Object> i = value.listIterator(); i.hasNext();) {
429         Object o = i.next();
430         if (o instanceof String) {
431            if (prev == null)
432               prev = o.toString();
433            else {
434               prev += o;
435               i.remove();
436               i.previous();
437               i.remove();
438               i.add(prev);
439            }
440         } else {
441            prev = null;
442         }
443      }
444      return value;
445   }
446
447   //-----------------------------------------------------------------------------------------------------------------
448   // Other methods
449   //-----------------------------------------------------------------------------------------------------------------
450
451   // Returns true if the specified character can safely be used in XML text or an attribute.
452   private static boolean isValidXmlCharacter(char c) {
453      return (c >= 0x20 && c <= 0xD7FF) /*|| c == 0xA || c == 0xD*/ || (c >= 0xE000 && c <= 0xFFFD);
454   }
455
456   // Returns true if the string at the specified position is of the form "_x####_"
457   // where '#' are hexadecimal characters.
458   private static boolean isEscapeSequence(String s, int i) {
459      return s.length() > i+6
460         && s.charAt(i) == '_'
461         && s.charAt(i+1) == 'x'
462         && isHexCharacter(s.charAt(i+2))
463         && isHexCharacter(s.charAt(i+3))
464         && isHexCharacter(s.charAt(i+4))
465         && isHexCharacter(s.charAt(i+5))
466         && s.charAt(i+6) == '_';
467   }
468
469   // Returns true if the character is a hexadecimal character
470   private static boolean isHexCharacter(char c) {
471      return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
472   }
473
474   // Converts an integer to a hexadecimal string padded to 4 places.
475   private static Writer appendPaddedHexChar(Writer out, int num) throws IOException {
476      out.append("_x");
477      for (char c : toHex4(num))
478         out.append(c);
479      return out.append('_');
480   }
481
482   /**
483    * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations.
484    *
485    * <p>
486    * The annotations should be a parent-to-child ordering of annotations found on a class or method.
487    *
488    * @param xmls The list of <ja>@Xml</ja> annotations.
489    * @param schemas The list of <ja>@XmlSchema</ja> annotations.
490    * @return The namespace, or <jk>null</jk> if it couldn't be found.
491    */
492   public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) {
493
494      for (int i = xmls.size()-1; i >= 0; i--) {
495         Xml xml = xmls.get(i);
496         Namespace ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas);
497         if (ns != null)
498            return ns;
499      }
500
501      for (int i = schemas.size()-1; i >= 0; i--) {
502         XmlSchema schema = schemas.get(i);
503         Namespace ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas);
504         if (ns != null)
505            return ns;
506      }
507
508      return null;
509   }
510
511   private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) {
512
513      // If both prefix and namespace specified, use that Namespace mapping.
514      if (! (prefix.isEmpty() || ns.isEmpty()))
515         return Namespace.of(prefix, ns);
516
517      // If only prefix specified, need to search for namespaceURI.
518      if (! prefix.isEmpty()) {
519         if (xmls != null)
520            for (Xml xml2 : xmls)
521               if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty())
522                  return Namespace.of(prefix, xml2.namespace());
523         for (XmlSchema schema : schemas) {
524            if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty())
525               return Namespace.of(prefix, schema.namespace());
526            for (XmlNs xmlNs : schema.xmlNs())
527               if (xmlNs.prefix().equals(prefix))
528                  return Namespace.of(prefix, xmlNs.namespaceURI());
529         }
530         throw new BeanRuntimeException("Found @Xml.prefix annotation with no matching URI.  prefix='"+prefix+"'");
531      }
532
533      // If only namespaceURI specified, need to search for prefix.
534      if (! ns.isEmpty()) {
535         if (xmls != null)
536            for (Xml xml2 : xmls)
537               if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty())
538                  return Namespace.of(xml2.prefix(), ns);
539         for (XmlSchema schema : schemas) {
540            if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty())
541               return Namespace.of(schema.prefix(), ns);
542            for (XmlNs xmlNs : schema.xmlNs())
543               if (xmlNs.namespaceURI().equals(ns))
544                  return Namespace.of(xmlNs.prefix(), ns);
545         }
546      }
547
548      return null;
549   }
550
551   /**
552    * Utility method that converts the current event on the XML stream to something human-readable for debug purposes.
553    *
554    * @param r The XML stream reader whose current event is to be converted to a readable string.
555    * @return The event in human-readable form.
556    */
557   public static String toReadableEvent(XMLStreamReader r) {
558      int t = r.getEventType();
559      if (t == 1)
560         return "<"+r.getLocalName()+">";
561      if (t == 2)
562         return "</"+r.getLocalName()+">";
563      if (t == 3)
564         return "PROCESSING_INSTRUCTION";
565      if (t == 4)
566         return "CHARACTERS=[" + r.getText() + "]";
567      if (t == 5)
568         return "COMMENTS=[" + r.getText() + "]";
569      if (t == 6)
570         return "SPACE=[" + r.getText() + "]";
571      if (t == 7)
572         return "START_DOCUMENT";
573      if (t == 8)
574         return "END_DOCUMENT";
575      if (t == 9)
576         return "ENTITY_REFERENCE";
577      if (t == 10)
578         return "ATTRIBUTE";
579      if (t == 11)
580         return "DTD";
581      if (t == 12)
582         return "CDATA=["+r.getText()+"]";
583      if (t == 13)
584         return "NAMESPACE";
585      if (t == 14)
586         return "NOTATION_DECLARATION";
587      if (t == 15)
588         return "ENTITY_DECLARATION";
589      return "UNKNOWN";
590   }
591}