001// ***************************************************************************************************************************
002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
003// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
005// * with the License.  You may obtain a copy of the License at                                                              *
006// *                                                                                                                         *
007// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
008// *                                                                                                                         *
009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
011// * specific language governing permissions and limitations under the License.                                              *
012// ***************************************************************************************************************************
013package org.apache.juneau.xml;
014
015import static org.apache.juneau.internal.StringUtils.*;
016
017import java.io.*;
018import java.util.*;
019
020import javax.xml.stream.*;
021
022import org.apache.juneau.*;
023import org.apache.juneau.internal.*;
024import org.apache.juneau.xml.annotation.*;
025
026/**
027 * XML utility methods.
028 */
029public final class XmlUtils {
030
031   //-----------------------------------------------------------------------------------------------------------------
032   // XML element names
033   //-----------------------------------------------------------------------------------------------------------------
034
035   /**
036    * Encodes any invalid XML element name characters to <code>_x####_</code> sequences.
037    *
038    * @param w The writer to send the output to.
039    * @param o The object being encoded.
040    * @return The same writer passed in.
041    * @throws IOException Throw by the writer.
042    */
043   public static final Writer encodeElementName(Writer w, Object o) throws IOException {
044
045      if (o == null)
046         return w.append("_x0000_");
047
048      String s = o.toString();
049
050      if (needsElementNameEncoding(s))
051         return encodeElementNameInner(w, s);
052
053      w.append(s);
054      return w;
055   }
056
057   /**
058    * Encodes any invalid XML element name characters to <code>_x####_</code> sequences.
059    *
060    * @param o The object being encoded.
061    * @return The encoded element name string.
062    */
063   public static final String encodeElementName(Object o) {
064      if (o == null)
065         return "_x0000_";
066
067      String s = o.toString();
068      if (s.isEmpty())
069         return "_xE000_";
070      try {
071         if (needsElementNameEncoding(s))
072            try (Writer w = new StringBuilderWriter(s.length() * 2)) {
073               return encodeElementNameInner(w, s).toString();
074            }
075      } catch (IOException e) {
076         throw new RuntimeException(e); // Never happens
077      }
078
079      return s;
080   }
081
082   private static final Writer encodeElementNameInner(Writer w, String s) throws IOException {
083      for (int i = 0; i < s.length(); i++) {
084         char c = s.charAt(i);
085         if ((c >= 'A' && c <= 'Z')
086               || (c == '_' && ! isEscapeSequence(s,i))
087               || (c >= 'a' && c <= 'z')
088               || (i != 0 && (
089                     c == '-'
090                     || c == '.'
091                     || (c >= '0' && c <= '9')
092                     || c == '\u00b7'
093                     || (c >= '\u0300' && c <= '\u036f')
094                     || (c >= '\u203f' && c <= '\u2040')
095                  ))
096               || (c >= '\u00c0' && c <= '\u00d6')
097               || (c >= '\u00d8' && c <= '\u00f6')
098               || (c >= '\u00f8' && c <= '\u02ff')
099               || (c >= '\u0370' && c <= '\u037d')
100               || (c >= '\u037f' && c <= '\u1fff')
101               || (c >= '\u200c' && c <= '\u200d')
102               || (c >= '\u2070' && c <= '\u218f')
103               || (c >= '\u2c00' && c <= '\u2fef')
104               || (c >= '\u3001' && c <= '\ud7ff')
105               || (c >= '\uf900' && c <= '\ufdcf')
106               || (c >= '\ufdf0' && c <= '\ufffd')) {
107            w.append(c);
108         }  else {
109            appendPaddedHexChar(w, c);
110         }
111      }
112      return w;
113   }
114
115   private static final boolean needsElementNameEncoding(String s) {
116      // Note that this doesn't need to be perfect, just fast.
117      for (int i = 0; i < s.length(); i++) {
118         char c = s.charAt(i);
119         if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))
120            return true;
121         if (i == 0 && (c >= '0' && c <= '9'))
122            return true;
123      }
124      return false;
125   }
126
127   //-----------------------------------------------------------------------------------------------------------------
128   // XML element text
129   //-----------------------------------------------------------------------------------------------------------------
130
131   /**
132    * Escapes invalid XML text characters to <code>_x####_</code> sequences.
133    *
134    * @param o The object being encoded.
135    * @return The encoded string.
136    */
137   public static final String escapeText(Object o) {
138
139      if (o == null)
140         return "_x0000_";
141
142      String s = o.toString();
143
144      try {
145         if (! needsTextEncoding(s))
146            return s;
147         final int len = s.length();
148         StringWriter sw = new StringWriter(s.length()*2);
149         for (int i = 0; i < len; i++) {
150            char c = s.charAt(i);
151            if ((i == 0 || i == len-1) && Character.isWhitespace(c))
152               appendPaddedHexChar(sw, c);
153            else if (c == '_' && isEscapeSequence(s,i))
154               appendPaddedHexChar(sw, c);
155            else if (isValidXmlCharacter(c))
156               sw.append(c);
157            else
158               appendPaddedHexChar(sw, c);
159         }
160         return sw.toString();
161      } catch (IOException e) {
162         throw new RuntimeException(e); // Never happens
163      }
164   }
165
166   /**
167    * Encodes the specified element text and sends the results to the specified writer.
168    *
169    * <p>
170    * Encodes any invalid XML text characters to <code>_x####_</code> sequences and sends the response to the specified
171    * writer.
172    * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, and <js>'&gt;'</js> as XML entities.
173    * <br>Encodes invalid XML text characters to <code>_x####_</code> sequences.
174    *
175    * @param w The writer to send the output to.
176    * @param o The object being encoded.
177    * @param trim Trim the text before serializing it.
178    * @param preserveWhitespace
179    *    Specifies whether we're in preserve-whitespace mode.
180    *    (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}.
181    *    If <jk>true</jk>, leading and trailing whitespace characters will be encoded.
182    * @return The same writer passed in.
183    * @throws IOException Thrown from the writer.
184    */
185   public static final Writer encodeText(Writer w, Object o, boolean trim, boolean preserveWhitespace) throws IOException {
186
187      if (o == null)
188         return w.append("_x0000_");
189
190      String s = o.toString();
191      if (s.isEmpty())
192         return w.append("_xE000_");
193      if (trim)
194         s = s.trim();
195
196      if (needsTextEncoding(s)) {
197         final int len = s.length();
198         for (int i = 0; i < len; i++) {
199            char c = s.charAt(i);
200            if ((i == 0 || i == len-1) && Character.isWhitespace(c) && ! preserveWhitespace)
201               appendPaddedHexChar(w, c);
202            else if (REPLACE_TEXT.contains(c))
203               w.append(REPLACE_TEXT.get(c));
204            else if (c == '_' && isEscapeSequence(s,i))
205               appendPaddedHexChar(w, c);
206            else if (isValidXmlCharacter(c))
207               w.append(c);
208            else
209               appendPaddedHexChar(w, c);
210         }
211      } else {
212         w.append(s);
213      }
214
215      return w;
216   }
217
218   private static final boolean needsTextEncoding(String s) {
219      // See if we need to convert the string.
220      // Conversion is somewhat expensive, so make sure we need to do so before hand.
221      final int len = s.length();
222      for (int i = 0; i < len; i++) {
223         char c = s.charAt(i);
224         if ((i == 0 || i == len-1) && Character.isWhitespace(c))
225            return true;
226         if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(s,i)))
227            return true;
228      }
229      return false;
230   }
231
232   private static AsciiMap REPLACE_TEXT = new AsciiMap()
233      .append('&', "&amp;")
234      .append('<', "&lt;")
235      .append('>', "&gt;")
236      .append((char)0x09, "&#x0009;")
237      .append((char)0x0A, "&#x000a;")
238      .append((char)0x0D, "&#x000d;");
239
240
241   //-----------------------------------------------------------------------------------------------------------------
242   // XML attribute names
243   //-----------------------------------------------------------------------------------------------------------------
244
245   /**
246    * Serializes and encodes the specified object as valid XML attribute name.
247    *
248    * @param w The writer to send the output to.
249    * @param o The object being serialized.
250    * @return This object (for method chaining).
251    * @throws IOException If a problem occurred.
252    */
253   public static final Writer encodeAttrName(Writer w, Object o) throws IOException {
254
255      if (o == null)
256         return w.append("_x0000_");
257
258      String s = o.toString();
259
260      if (needsAttrNameEncoding(s)) {
261         for (int i = 0; i < s.length(); i++) {
262            char c = s.charAt(i);
263            if (i == 0) {
264               if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':')
265                  w.append(c);
266               else if (c == '_' && ! isEscapeSequence(s,i))
267                  w.append(c);
268               else
269                  appendPaddedHexChar(w, c);
270            } else {
271               if ((c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':'))
272                  w.append(c);
273               else if (c == '_' && ! isEscapeSequence(s,i))
274                  w.append(c);
275               else
276                  appendPaddedHexChar(w, c);
277            }
278         }
279      } else {
280         w.append(s);
281      }
282
283      return w;
284   }
285
286   private static final boolean needsAttrNameEncoding(String s) {
287      // Note that this doesn't need to be perfect, just fast.
288      for (int i = 0; i < s.length(); i++) {
289         char c = s.charAt(i);
290         if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))
291            return true;
292         if (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))
293            return true;
294      }
295      return false;
296   }
297
298   //-----------------------------------------------------------------------------------------------------------------
299   // XML attribute values
300   //-----------------------------------------------------------------------------------------------------------------
301
302   /**
303    * Encodes the specified attribute value and sends the results to the specified writer.
304    *
305    * <p>
306    * Encodes any invalid XML text characters to <code>_x####_</code> sequences and sends the response to the specified
307    * writer.
308    * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, <js>'&gt;'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities.
309    * <br>Encodes invalid XML text characters to <code>_x####_</code> sequences.
310    *
311    * @param w The writer to send the output to.
312    * @param o The object being encoded.
313    * @param trim
314    *    Trim the text before serializing it.
315    *    If <jk>true</jk>, leading and trailing whitespace characters will be encoded.
316    * @return The same writer passed in.
317    * @throws IOException Thrown from the writer.
318    */
319   public static final Writer encodeAttrValue(Writer w, Object o, boolean trim) throws IOException {
320      if (o == null)
321         return w.append("_x0000_");
322
323      String s = o.toString();
324      if (s.isEmpty())
325         return w;
326      if (trim)
327         s = s.trim();
328
329      if (needsAttrValueEncoding(s)) {
330         final int len = s.length();
331         for (int i = 0; i < len; i++) {
332            char c = s.charAt(i);
333            if ((i == 0 || i == len-1) && Character.isWhitespace(c))
334               appendPaddedHexChar(w, c);
335            else if (REPLACE_ATTR_VAL.contains(c))
336               w.append(REPLACE_ATTR_VAL.get(c));
337            else if (c == '_' && isEscapeSequence(s,i))
338               appendPaddedHexChar(w, c);
339            else if (isValidXmlCharacter(c))
340               w.append(c);
341            else
342               appendPaddedHexChar(w, c);
343         }
344      } else {
345         w.append(s);
346      }
347
348      return w;
349   }
350
351   private static final boolean needsAttrValueEncoding(String s) {
352      // See if we need to convert the string.
353      // Conversion is somewhat expensive, so make sure we need to do so before hand.
354      final int len = s.length();
355      for (int i = 0; i < len; i++) {
356         char c = s.charAt(i);
357         if ((i == 0 || i == len-1) && Character.isWhitespace(c))
358            return true;
359         if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(s,i)))
360            return true;
361      }
362      return false;
363   }
364
365   private static AsciiMap REPLACE_ATTR_VAL = new AsciiMap()
366      .append('&', "&amp;")
367      .append('<', "&lt;")
368      .append('>', "&gt;")
369      .append('"', "&quot;")
370      .append('\'', "&apos;")
371      .append((char)0x09, "&#x0009;")
372      .append((char)0x0A, "&#x000a;")
373      .append((char)0x0D, "&#x000d;");
374
375
376   //-----------------------------------------------------------------------------------------------------------------
377   // Decode XML text
378   //-----------------------------------------------------------------------------------------------------------------
379
380   /**
381    * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters.
382    *
383    * @param s The string being decoded.
384    * @param sb The string builder to use as a scratch pad.
385    * @return The decoded string.
386    */
387   public static final String decode(String s, StringBuilder sb) {
388      if (s == null) return null;
389      if (s.length() == 0)
390         return s;
391      if (s.indexOf('_') == -1)
392         return s;
393
394      if (sb == null)
395         sb = new StringBuilder(s.length());
396      for (int i = 0; i < s.length(); i++) {
397         char c = s.charAt(i);
398         if (c == '_' && isEscapeSequence(s,i)) {
399
400            int x = Integer.parseInt(s.substring(i+2, i+6), 16);
401
402            // If we find _x0000_, then that means a null.
403            // If we find _xE000_, then that means an empty string.
404            if (x == 0)
405               return null;
406            else if (x != 0xE000)
407               sb.append((char)x);
408
409            i+=6;
410         } else {
411            sb.append(c);
412         }
413      }
414      return sb.toString();
415   }
416
417
418   /**
419    * Given a list of Strings and other Objects, combines Strings that are next to each other in the list.
420    *
421    * @param l The list of text nodes to collapse.
422    * @return The same list.
423    */
424   public static LinkedList<Object> collapseTextNodes(LinkedList<Object> l) {
425
426      String prev = null;
427      for (ListIterator<Object> i = l.listIterator(); i.hasNext();) {
428         Object o = i.next();
429         if (o instanceof String) {
430            if (prev == null)
431               prev = o.toString();
432            else {
433               prev += o;
434               i.remove();
435               i.previous();
436               i.remove();
437               i.add(prev);
438            }
439         } else {
440            prev = null;
441         }
442      }
443      return l;
444   }
445
446   //-----------------------------------------------------------------------------------------------------------------
447   // Other methods
448   //-----------------------------------------------------------------------------------------------------------------
449
450   // Returns true if the specified character can safely be used in XML text or an attribute.
451   private static final boolean isValidXmlCharacter(char c) {
452      return (c >= 0x20 && c <= 0xD7FF) /*|| c == 0xA || c == 0xD*/ || (c >= 0xE000 && c <= 0xFFFD);
453   }
454
455   // Returns true if the string at the specified position is of the form "_x####_"
456   // where '#' are hexadecimal characters.
457   private static final boolean isEscapeSequence(String s, int i) {
458      return s.length() > i+6
459         && s.charAt(i) == '_'
460         && s.charAt(i+1) == 'x'
461         && isHexCharacter(s.charAt(i+2))
462         && isHexCharacter(s.charAt(i+3))
463         && isHexCharacter(s.charAt(i+4))
464         && isHexCharacter(s.charAt(i+5))
465         && s.charAt(i+6) == '_';
466   }
467
468   // Returns true if the character is a hexadecimal character
469   private static final boolean isHexCharacter(char c) {
470      return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
471   }
472
473   // Converts an integer to a hexadecimal string padded to 4 places.
474   private static final Writer appendPaddedHexChar(Writer out, int num) throws IOException {
475      out.append("_x");
476      for (char c : toHex4(num))
477         out.append(c);
478      return out.append('_');
479   }
480
481   /**
482    * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations.
483    *
484    * <p>
485    * The annotations should be a child-to-parent ordering of annotations found on a class or method.
486    *
487    * @param xmls The list of <ja>@Xml</ja> annotations.
488    * @param schemas The list of <ja>@XmlSchema</ja> annotations.
489    * @return The namespace, or <jk>null</jk> if it couldn't be found.
490    */
491   public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) {
492
493      for (Xml xml : xmls) {
494         Namespace ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas);
495         if (ns != null)
496            return ns;
497      }
498
499      for (XmlSchema schema : schemas) {
500         Namespace ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas);
501         if (ns != null)
502            return ns;
503      }
504
505      return null;
506   }
507
508   private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) {
509
510      // If both prefix and namespace specified, use that Namespace mapping.
511      if (! (prefix.isEmpty() || ns.isEmpty()))
512         return Namespace.create(prefix, ns);
513
514      // If only prefix specified, need to search for namespaceURI.
515      if (! prefix.isEmpty()) {
516         if (xmls != null)
517            for (Xml xml2 : xmls)
518               if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty())
519                  return Namespace.create(prefix, xml2.namespace());
520         for (XmlSchema schema : schemas) {
521            if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty())
522               return Namespace.create(prefix, schema.namespace());
523            for (XmlNs xmlNs : schema.xmlNs())
524               if (xmlNs.prefix().equals(prefix))
525                  return Namespace.create(prefix, xmlNs.namespaceURI());
526         }
527         throw new BeanRuntimeException("Found @Xml.prefix annotation with no matching URI.  prefix='"+prefix+"'");
528      }
529
530      // If only namespaceURI specified, need to search for prefix.
531      if (! ns.isEmpty()) {
532         if (xmls != null)
533            for (Xml xml2 : xmls)
534               if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty())
535                  return Namespace.create(xml2.prefix(), ns);
536         for (XmlSchema schema : schemas) {
537            if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty())
538               return Namespace.create(schema.prefix(), ns);
539            for (XmlNs xmlNs : schema.xmlNs())
540               if (xmlNs.namespaceURI().equals(ns))
541                  return Namespace.create(xmlNs.prefix(), ns);
542         }
543      }
544
545      return null;
546   }
547
548   /**
549    * Utility method that converts the current event on the XML stream to something human-readable for debug purposes.
550    *
551    * @param r The XML stream reader whose current event is to be converted to a readable string.
552    * @return The event in human-readable form.
553    */
554   public static final String toReadableEvent(XMLStreamReader r) {
555      int t = r.getEventType();
556      if (t == 1)
557         return "<"+r.getLocalName()+">";
558      if (t == 2)
559         return "</"+r.getLocalName()+">";
560      if (t == 3)
561         return "PROCESSING_INSTRUCTION";
562      if (t == 4)
563         return "CHARACTERS=[" + r.getText() + "]";
564      if (t == 5)
565         return "COMMENTS=[" + r.getText() + "]";
566      if (t == 6)
567         return "SPACE=[" + r.getText() + "]";
568      if (t == 7)
569         return "START_DOCUMENT";
570      if (t == 8)
571         return "END_DOCUMENT";
572      if (t == 9)
573         return "ENTITY_REFERENCE";
574      if (t == 10)
575         return "ATTRIBUTE";
576      if (t == 11)
577         return "DTD";
578      if (t == 12)
579         return "CDATA=["+r.getText()+"]";
580      if (t == 13)
581         return "NAMESPACE";
582      if (t == 14)
583         return "NOTATION_DECLARATION";
584      if (t == 15)
585         return "ENTITY_DECLARATION";
586      return "UNKNOWN";
587   }
588}