001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.juneau.xml;
018
019import static org.apache.juneau.common.utils.StringUtils.*;
020import static org.apache.juneau.common.utils.ThrowableUtils.*;
021
022import java.io.*;
023import java.util.*;
024
025import javax.xml.stream.*;
026
027import org.apache.juneau.*;
028import org.apache.juneau.common.utils.*;
029import org.apache.juneau.internal.*;
030import org.apache.juneau.xml.annotation.*;
031
032/**
033 * XML utility methods.
034 *
035 * <h5 class='section'>See Also:</h5><ul>
036 *    <li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/XmlBasics">XML Basics</a>
037
038 * </ul>
039 */
040public class XmlUtils {
041
042   //-----------------------------------------------------------------------------------------------------------------
043   // XML element names
044   //-----------------------------------------------------------------------------------------------------------------
045
046   /**
047    * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
048    *
049    * @param w The writer to send the output to.
050    * @param value The object being encoded.
051    * @return The same writer passed in.
052    */
053   public static Writer encodeElementName(Writer w, Object value) {
054      try {
055         if (value == null)
056            return w.append("_x0000_");
057         String s = value.toString();
058         if (needsElementNameEncoding(s))
059            return encodeElementNameInner(w, s);
060         w.append(s);
061      } catch (IOException e) {
062         throw asRuntimeException(e);
063      }
064      return w;
065   }
066
067   /**
068    * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
069    *
070    * @param value The object being encoded.
071    * @return The encoded element name string.
072    */
073   public static String encodeElementName(Object value) {
074      if (value == null)
075         return "_x0000_";
076      String s = value.toString();
077      if (s.isEmpty())
078         return "_xE000_";
079
080      try {
081         if (needsElementNameEncoding(s))
082            try (Writer w = new StringBuilderWriter(s.length() * 2)) {
083               return encodeElementNameInner(w, s).toString();
084            }
085      } catch (IOException e) {
086         throw asRuntimeException(e); // Never happens
087      }
088
089      return s;
090   }
091
092   private static Writer encodeElementNameInner(Writer w, String s) throws IOException {
093      for (int i = 0; i < s.length(); i++) {
094         char c = s.charAt(i);
095         if ((c >= 'A' && c <= 'Z')
096               || (c == '_' && ! isEscapeSequence(s,i))
097               || (c >= 'a' && c <= 'z')
098               || (i != 0 && (
099                     c == '-'
100                     || c == '.'
101                     || (c >= '0' && c <= '9')
102                     || c == '\u00b7'
103                     || (c >= '\u0300' && c <= '\u036f')
104                     || (c >= '\u203f' && c <= '\u2040')
105                  ))
106               || (c >= '\u00c0' && c <= '\u00d6')
107               || (c >= '\u00d8' && c <= '\u00f6')
108               || (c >= '\u00f8' && c <= '\u02ff')
109               || (c >= '\u0370' && c <= '\u037d')
110               || (c >= '\u037f' && c <= '\u1fff')
111               || (c >= '\u200c' && c <= '\u200d')
112               || (c >= '\u2070' && c <= '\u218f')
113               || (c >= '\u2c00' && c <= '\u2fef')
114               || (c >= '\u3001' && c <= '\ud7ff')
115               || (c >= '\uf900' && c <= '\ufdcf')
116               || (c >= '\ufdf0' && c <= '\ufffd')) {
117            w.append(c);
118         }  else {
119            appendPaddedHexChar(w, c);
120         }
121      }
122      return w;
123   }
124
125   private static boolean needsElementNameEncoding(String value) {
126      // Note that this doesn't need to be perfect, just fast.
127      for (int i = 0; i < value.length(); i++) {
128         char c = value.charAt(i);
129         if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && (c >= '0' && c <= '9')))
130            return true;
131      }
132      return false;
133   }
134
135   //-----------------------------------------------------------------------------------------------------------------
136   // XML element text
137   //-----------------------------------------------------------------------------------------------------------------
138
139   /**
140    * Escapes invalid XML text characters to <c>_x####_</c> sequences.
141    *
142    * @param value The object being encoded.
143    * @return The encoded string.
144    */
145   public static String escapeText(Object value) {
146      if (value == null)
147         return "_x0000_";
148      String s = value.toString();
149
150      try {
151         if (! needsTextEncoding(s))
152            return s;
153         final int len = s.length();
154         StringWriter sw = new StringWriter(s.length()*2);
155         for (int i = 0; i < len; i++) {
156            char c = s.charAt(i);
157            if ((i == 0 || i == len-1) && Character.isWhitespace(c))
158               appendPaddedHexChar(sw, c);
159            else if (c == '_' && isEscapeSequence(s,i))
160               appendPaddedHexChar(sw, c);
161            else if (isValidXmlCharacter(c))
162               sw.append(c);
163            else
164               appendPaddedHexChar(sw, c);
165         }
166         return sw.toString();
167      } catch (IOException e) {
168         throw asRuntimeException(e); // Never happens
169      }
170   }
171
172   /**
173    * Encodes the specified element text and sends the results to the specified writer.
174    *
175    * <p>
176    * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified
177    * writer.
178    * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, and <js>'&gt;'</js> as XML entities.
179    * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences.
180    *
181    * @param w The writer to send the output to.
182    * @param value The object being encoded.
183    * @param trim Trim the text before serializing it.
184    * @param preserveWhitespace
185    *    Specifies whether we're in preserve-whitespace mode.
186    *    (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}.
187    *    If <jk>true</jk>, leading and trailing whitespace characters will be encoded.
188    * @return The same writer passed in.
189    */
190   public static Writer encodeText(Writer w, Object value, boolean trim, boolean preserveWhitespace) {
191
192      try {
193         if (value == null)
194            return w.append("_x0000_");
195         String s = value.toString();
196         if (s.isEmpty())
197            return w.append("_xE000_");
198         if (trim)
199            s = s.trim();
200
201         if (needsTextEncoding(s)) {
202            final int len = s.length();
203            for (int i = 0; i < len; i++) {
204               char c = s.charAt(i);
205               if ((i == 0 || i == len-1) && Character.isWhitespace(c) && ! preserveWhitespace)
206                  appendPaddedHexChar(w, c);
207               else if (REPLACE_TEXT.contains(c))
208                  w.append(REPLACE_TEXT.get(c));
209               else if (c == '_' && isEscapeSequence(s,i))
210                  appendPaddedHexChar(w, c);
211               else if (isValidXmlCharacter(c))
212                  w.append(c);
213               else
214                  appendPaddedHexChar(w, c);
215            }
216         } else {
217            w.append(s);
218         }
219      } catch (IOException e) {
220         throw asRuntimeException(e);
221      }
222
223      return w;
224   }
225
226   private static boolean needsTextEncoding(String value) {
227      // See if we need to convert the string.
228      // Conversion is somewhat expensive, so make sure we need to do so before hand.
229      final int len = value.length();
230      for (int i = 0; i < len; i++) {
231         char c = value.charAt(i);
232         if ((i == 0 || i == len-1) && Character.isWhitespace(c))
233            return true;
234         if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value,i)))
235            return true;
236      }
237      return false;
238   }
239
240   private static AsciiMap REPLACE_TEXT = new AsciiMap()
241      .append('&', "&amp;")
242      .append('<', "&lt;")
243      .append('>', "&gt;")
244      .append((char)0x09, "&#x0009;")
245      .append((char)0x0A, "&#x000a;")
246      .append((char)0x0D, "&#x000d;");
247
248
249   //-----------------------------------------------------------------------------------------------------------------
250   // XML attribute names
251   //-----------------------------------------------------------------------------------------------------------------
252
253   /**
254    * Serializes and encodes the specified object as valid XML attribute name.
255    *
256    * @param w The writer to send the output to.
257    * @param value The object being serialized.
258    * @return This object.
259    * @throws IOException If a problem occurred.
260    */
261   public static Writer encodeAttrName(Writer w, Object value) throws IOException {
262      if (value == null)
263         return w.append("_x0000_");
264      String s = value.toString();
265
266      if (needsAttrNameEncoding(s)) {
267         for (int i = 0; i < s.length(); i++) {
268            char c = s.charAt(i);
269            if (i == 0) {
270               if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':')
271                  w.append(c);
272               else if (c == '_' && ! isEscapeSequence(s,i))
273                  w.append(c);
274               else
275                  appendPaddedHexChar(w, c);
276            } else {
277               if ((c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':'))
278                  w.append(c);
279               else if (c == '_' && ! isEscapeSequence(s,i))
280                  w.append(c);
281               else
282                  appendPaddedHexChar(w, c);
283            }
284         }
285      } else {
286         w.append(s);
287      }
288
289      return w;
290   }
291
292   private static boolean needsAttrNameEncoding(String value) {
293      // Note that this doesn't need to be perfect, just fast.
294      for (int i = 0; i < value.length(); i++) {
295         char c = value.charAt(i);
296         if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')))
297            return true;
298      }
299      return false;
300   }
301
302   //-----------------------------------------------------------------------------------------------------------------
303   // XML attribute values
304   //-----------------------------------------------------------------------------------------------------------------
305
306   /**
307    * Encodes the specified attribute value and sends the results to the specified writer.
308    *
309    * <p>
310    * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified
311    * writer.
312    * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, <js>'&gt;'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities.
313    * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences.
314    *
315    * @param w The writer to send the output to.
316    * @param value The object being encoded.
317    * @param trim
318    *    Trim the text before serializing it.
319    *    If <jk>true</jk>, leading and trailing whitespace characters will be encoded.
320    * @return The same writer passed in.
321    */
322   public static Writer encodeAttrValue(Writer w, Object value, boolean trim) {
323      try {
324         if (value == null)
325            return w.append("_x0000_");
326         String s = value.toString();
327         if (s.isEmpty())
328            return w;
329         if (trim)
330            s = s.trim();
331
332         if (needsAttrValueEncoding(s)) {
333            final int len = s.length();
334            for (int i = 0; i < len; i++) {
335               char c = s.charAt(i);
336               if ((i == 0 || i == len-1) && Character.isWhitespace(c))
337                  appendPaddedHexChar(w, c);
338               else if (REPLACE_ATTR_VAL.contains(c))
339                  w.append(REPLACE_ATTR_VAL.get(c));
340               else if (c == '_' && isEscapeSequence(s,i))
341                  appendPaddedHexChar(w, c);
342               else if (isValidXmlCharacter(c))
343                  w.append(c);
344               else
345                  appendPaddedHexChar(w, c);
346            }
347         } else {
348            w.append(s);
349         }
350      } catch (IOException e) {
351         throw asRuntimeException(e);
352      }
353
354      return w;
355   }
356
357   private static boolean needsAttrValueEncoding(String value) {
358      // See if we need to convert the string.
359      // Conversion is somewhat expensive, so make sure we need to do so before hand.
360      final int len = value.length();
361      for (int i = 0; i < len; i++) {
362         char c = value.charAt(i);
363         if ((i == 0 || i == len-1) && Character.isWhitespace(c))
364            return true;
365         if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value,i)))
366            return true;
367      }
368      return false;
369   }
370
371   private static AsciiMap REPLACE_ATTR_VAL = new AsciiMap()
372      .append('&', "&amp;")
373      .append('<', "&lt;")
374      .append('>', "&gt;")
375      .append('"', "&quot;")
376      .append('\'', "&apos;")
377      .append((char)0x09, "&#x0009;")
378      .append((char)0x0A, "&#x000a;")
379      .append((char)0x0D, "&#x000d;");
380
381
382   //-----------------------------------------------------------------------------------------------------------------
383   // Decode XML text
384   //-----------------------------------------------------------------------------------------------------------------
385
386   /**
387    * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters.
388    *
389    * @param value The string being decoded.
390    * @param sb The string builder to use as a scratch pad.
391    * @return The decoded string.
392    */
393   public static String decode(String value, StringBuilder sb) {
394      if (value == null)
395         return null;
396      if (value.isEmpty() || value.indexOf('_') == -1)
397         return value;
398      if (sb == null)
399         sb = new StringBuilder(value.length());
400
401      for (int i = 0; i < value.length(); i++) {
402         char c = value.charAt(i);
403         if (c == '_' && isEscapeSequence(value,i)) {
404
405            int x = Integer.parseInt(value.substring(i+2, i+6), 16);
406
407            // If we find _x0000_, then that means a null.
408            // If we find _xE000_, then that means an empty string.
409            if (x == 0)
410               return null;
411            else if (x != 0xE000)
412               sb.append((char)x);
413
414            i+=6;
415         } else {
416            sb.append(c);
417         }
418      }
419      return sb.toString();
420   }
421
422
423   /**
424    * Given a list of Strings and other Objects, combines Strings that are next to each other in the list.
425    *
426    * @param value The list of text nodes to collapse.
427    * @return The same list.
428    */
429   public static LinkedList<Object> collapseTextNodes(LinkedList<Object> value) {
430
431      String prev = null;
432      for (ListIterator<Object> i = value.listIterator(); i.hasNext();) {
433         Object o = i.next();
434         if (o instanceof String) {
435            if (prev == null)
436               prev = o.toString();
437            else {
438               prev += o;
439               i.remove();
440               i.previous();
441               i.remove();
442               i.add(prev);
443            }
444         } else {
445            prev = null;
446         }
447      }
448      return value;
449   }
450
451   //-----------------------------------------------------------------------------------------------------------------
452   // Other methods
453   //-----------------------------------------------------------------------------------------------------------------
454
455   // Returns true if the specified character can safely be used in XML text or an attribute.
456   private static boolean isValidXmlCharacter(char c) {
457      return (c >= 0x20 && c <= 0xD7FF) /*|| c == 0xA || c == 0xD*/ || (c >= 0xE000 && c <= 0xFFFD);
458   }
459
460   // Returns true if the string at the specified position is of the form "_x####_"
461   // where '#' are hexadecimal characters.
462   private static boolean isEscapeSequence(String s, int i) {
463      return s.length() > i+6
464         && s.charAt(i) == '_'
465         && s.charAt(i+1) == 'x'
466         && isHexCharacter(s.charAt(i+2))
467         && isHexCharacter(s.charAt(i+3))
468         && isHexCharacter(s.charAt(i+4))
469         && isHexCharacter(s.charAt(i+5))
470         && s.charAt(i+6) == '_';
471   }
472
473   // Returns true if the character is a hexadecimal character
474   private static boolean isHexCharacter(char c) {
475      return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
476   }
477
478   // Converts an integer to a hexadecimal string padded to 4 places.
479   private static Writer appendPaddedHexChar(Writer out, int num) throws IOException {
480      out.append("_x");
481      for (char c : toHex4(num))
482         out.append(c);
483      return out.append('_');
484   }
485
486   /**
487    * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations.
488    *
489    * <p>
490    * The annotations should be a parent-to-child ordering of annotations found on a class or method.
491    *
492    * @param xmls The list of <ja>@Xml</ja> annotations.
493    * @param schemas The list of <ja>@XmlSchema</ja> annotations.
494    * @return The namespace, or <jk>null</jk> if it couldn't be found.
495    */
496   public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) {
497
498      for (int i = xmls.size()-1; i >= 0; i--) {
499         Xml xml = xmls.get(i);
500         Namespace ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas);
501         if (ns != null)
502            return ns;
503      }
504
505      for (int i = schemas.size()-1; i >= 0; i--) {
506         XmlSchema schema = schemas.get(i);
507         Namespace ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas);
508         if (ns != null)
509            return ns;
510      }
511
512      return null;
513   }
514
515   private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) {
516
517      // If both prefix and namespace specified, use that Namespace mapping.
518      if (! (prefix.isEmpty() || ns.isEmpty()))
519         return Namespace.of(prefix, ns);
520
521      // If only prefix specified, need to search for namespaceURI.
522      if (! prefix.isEmpty()) {
523         if (xmls != null)
524            for (Xml xml2 : xmls)
525               if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty())
526                  return Namespace.of(prefix, xml2.namespace());
527         for (XmlSchema schema : schemas) {
528            if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty())
529               return Namespace.of(prefix, schema.namespace());
530            for (XmlNs xmlNs : schema.xmlNs())
531               if (xmlNs.prefix().equals(prefix))
532                  return Namespace.of(prefix, xmlNs.namespaceURI());
533         }
534         throw new BeanRuntimeException("Found @Xml.prefix annotation with no matching URI.  prefix='"+prefix+"'");
535      }
536
537      // If only namespaceURI specified, need to search for prefix.
538      if (! ns.isEmpty()) {
539         if (xmls != null)
540            for (Xml xml2 : xmls)
541               if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty())
542                  return Namespace.of(xml2.prefix(), ns);
543         for (XmlSchema schema : schemas) {
544            if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty())
545               return Namespace.of(schema.prefix(), ns);
546            for (XmlNs xmlNs : schema.xmlNs())
547               if (xmlNs.namespaceURI().equals(ns))
548                  return Namespace.of(xmlNs.prefix(), ns);
549         }
550      }
551
552      return null;
553   }
554
555   /**
556    * Utility method that converts the current event on the XML stream to something human-readable for debug purposes.
557    *
558    * @param r The XML stream reader whose current event is to be converted to a readable string.
559    * @return The event in human-readable form.
560    */
561   public static String toReadableEvent(XMLStreamReader r) {
562      int t = r.getEventType();
563      if (t == 1)
564         return "<"+r.getLocalName()+">";
565      if (t == 2)
566         return "</"+r.getLocalName()+">";
567      if (t == 3)
568         return "PROCESSING_INSTRUCTION";
569      if (t == 4)
570         return "CHARACTERS=[" + r.getText() + "]";
571      if (t == 5)
572         return "COMMENTS=[" + r.getText() + "]";
573      if (t == 6)
574         return "SPACE=[" + r.getText() + "]";
575      if (t == 7)
576         return "START_DOCUMENT";
577      if (t == 8)
578         return "END_DOCUMENT";
579      if (t == 9)
580         return "ENTITY_REFERENCE";
581      if (t == 10)
582         return "ATTRIBUTE";
583      if (t == 11)
584         return "DTD";
585      if (t == 12)
586         return "CDATA=["+r.getText()+"]";
587      if (t == 13)
588         return "NAMESPACE";
589      if (t == 14)
590         return "NOTATION_DECLARATION";
591      if (t == 15)
592         return "ENTITY_DECLARATION";
593      return "UNKNOWN";
594   }
595}