001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.juneau.xml;
018
019import static org.apache.juneau.commons.utils.StringUtils.*;
020import static org.apache.juneau.commons.utils.ThrowableUtils.*;
021import static org.apache.juneau.commons.utils.Utils.*;
022
023import java.io.*;
024import java.util.*;
025
026import javax.xml.stream.*;
027
028import org.apache.juneau.commons.io.*;
029import org.apache.juneau.commons.lang.*;
030import org.apache.juneau.xml.annotation.*;
031
032/**
033 * XML utility methods.
034 *
035 * <h5 class='section'>See Also:</h5><ul>
036 *    <li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/XmlBasics">XML Basics</a>
037
038 * </ul>
039 */
040@SuppressWarnings("resource")
041public class XmlUtils {
042
043   // @formatter:off
044   private static AsciiMap REPLACE_TEXT = new AsciiMap()
045      .append('&', "&amp;")
046      .append('<', "&lt;")
047      .append('>', "&gt;")
048      .append((char)0x09, "&#x0009;")
049      .append((char)0x0A, "&#x000a;")
050      .append((char)0x0D, "&#x000d;");
051
052   private static AsciiMap REPLACE_ATTR_VAL = new AsciiMap()
053      .append('&', "&amp;")
054      .append('<', "&lt;")
055      .append('>', "&gt;")
056      .append('"', "&quot;")
057      .append('\'', "&apos;")
058      .append((char)0x09, "&#x0009;")
059      .append((char)0x0A, "&#x000a;")
060      .append((char)0x0D, "&#x000d;");
061   // @formatter:on
062
063   /**
064    * Given a list of Strings and other Objects, combines Strings that are next to each other in the list.
065    *
066    * @param value The list of text nodes to collapse.
067    * @return The same list.
068    */
069   public static LinkedList<Object> collapseTextNodes(LinkedList<Object> value) {
070
071      var prev = (String)null;
072      for (ListIterator<Object> i = value.listIterator(); i.hasNext();) {
073         Object o = i.next();
074         if (o instanceof String o2) {
075            if (prev == null)
076               prev = o2.toString();
077            else {
078               prev += o2;
079               i.remove();
080               i.previous();
081               i.remove();
082               i.add(prev);
083            }
084         } else {
085            prev = null;
086         }
087      }
088      return value;
089   }
090
091   /**
092    * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters.
093    *
094    * @param value The string being decoded.
095    * @param sb The string builder to use as a scratch pad.
096    * @return The decoded string.
097    */
098   public static String decode(String value, StringBuilder sb) {
099      if (value == null)
100         return null;
101      if (value.isEmpty() || value.indexOf('_') == -1)
102         return value;
103      if (sb == null)
104         sb = new StringBuilder(value.length());
105
106      for (var i = 0; i < value.length(); i++) {
107         var c = value.charAt(i);
108         if (c == '_' && isEscapeSequence(value, i)) {
109
110            var x = Integer.parseInt(value.substring(i + 2, i + 6), 16);
111
112            // If we find _x0000_, then that means a null.
113            // If we find _xE000_, then that means an empty string.
114            if (x == 0)
115               return null;
116            else if (x != 0xE000)
117               sb.append((char)x);
118
119            i += 6;
120         } else {
121            sb.append(c);
122         }
123      }
124      return sb.toString();
125   }
126
127   /**
128    * Serializes and encodes the specified object as valid XML attribute name.
129    *
130    * @param w The writer to send the output to.
131    * @param value The object being serialized.
132    * @return This object.
133    * @throws IOException If a problem occurred.
134    */
135   public static Writer encodeAttrName(Writer w, Object value) throws IOException {
136      if (value == null)
137         return w.append("_x0000_");
138      var s = value.toString();
139
140      if (needsAttrNameEncoding(s)) {
141         for (var i = 0; i < s.length(); i++) {
142            var c = s.charAt(i);
143            if (i == 0) {
144               if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':')
145                  w.append(c);
146               else if (c == '_' && ! isEscapeSequence(s, i))
147                  w.append(c);
148               else
149                  appendPaddedHexChar(w, c);
150            } else {
151               if ((c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':'))
152                  w.append(c);
153               else if (c == '_' && ! isEscapeSequence(s, i))
154                  w.append(c);
155               else
156                  appendPaddedHexChar(w, c);
157            }
158         }
159      } else {
160         w.append(s);
161      }
162
163      return w;
164   }
165
166   /**
167    * Encodes the specified attribute value and sends the results to the specified writer.
168    *
169    * <p>
170    * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified
171    * writer.
172    * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, <js>'&gt;'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities.
173    * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences.
174    *
175    * @param w The writer to send the output to.
176    * @param value The object being encoded.
177    * @param trim
178    *    Trim the text before serializing it.
179    *    If <jk>true</jk>, leading and trailing whitespace characters will be encoded.
180    * @return The same writer passed in.
181    */
182   public static Writer encodeAttrValue(Writer w, Object value, boolean trim) {
183      try {
184         if (value == null)
185            return w.append("_x0000_");
186         var s = value.toString();
187         if (s.isEmpty())
188            return w;
189         if (trim)
190            s = s.trim();
191
192         if (needsAttrValueEncoding(s)) {
193            var len = s.length();
194            for (var i = 0; i < len; i++) {
195               var c = s.charAt(i);
196               if ((i == 0 || i == len - 1) && Character.isWhitespace(c))
197                  appendPaddedHexChar(w, c);
198               else if (REPLACE_ATTR_VAL.contains(c))
199                  w.append(REPLACE_ATTR_VAL.get(c));
200               else if (c == '_' && isEscapeSequence(s, i))
201                  appendPaddedHexChar(w, c);
202               else if (isValidXmlCharacter(c))
203                  w.append(c);
204               else
205                  appendPaddedHexChar(w, c);
206            }
207         } else {
208            w.append(s);
209         }
210      } catch (IOException e) {
211         throw toRex(e);
212      }
213
214      return w;
215   }
216
217   /**
218    * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
219    *
220    * @param value The object being encoded.
221    * @return The encoded element name string.
222    */
223   public static String encodeElementName(Object value) {
224      if (value == null)
225         return "_x0000_";
226      var s = value.toString();
227      if (s.isEmpty())
228         return "_xE000_";
229
230      try {
231         if (needsElementNameEncoding(s))
232            try (var w = new StringBuilderWriter(s.length() * 2)) {
233               return encodeElementNameInner(w, s).toString();
234            }
235      } catch (IOException e) {
236         throw toRex(e); // Never happens
237      }
238
239      return s;
240   }
241
242   /**
243    * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
244    *
245    * @param w The writer to send the output to.
246    * @param value The object being encoded.
247    * @return The same writer passed in.
248    */
249   public static Writer encodeElementName(Writer w, Object value) {
250      try {
251         if (value == null)
252            return w.append("_x0000_");
253         var s = value.toString();
254         if (needsElementNameEncoding(s))
255            return encodeElementNameInner(w, s);
256         w.append(s);
257      } catch (IOException e) {
258         throw toRex(e);
259      }
260      return w;
261   }
262
263   /**
264    * Encodes the specified element text and sends the results to the specified writer.
265    *
266    * <p>
267    * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified
268    * writer.
269    * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, and <js>'&gt;'</js> as XML entities.
270    * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences.
271    *
272    * @param w The writer to send the output to.
273    * @param value The object being encoded.
274    * @param trim Trim the text before serializing it.
275    * @param preserveWhitespace
276    *    Specifies whether we're in preserve-whitespace mode.
277    *    (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}.
278    *    If <jk>true</jk>, leading and trailing whitespace characters will be encoded.
279    * @return The same writer passed in.
280    */
281   public static Writer encodeText(Writer w, Object value, boolean trim, boolean preserveWhitespace) {
282
283      try {
284         if (value == null)
285            return w.append("_x0000_");
286         var s = value.toString();
287         if (s.isEmpty())
288            return w.append("_xE000_");
289         if (trim)
290            s = s.trim();
291
292         if (needsTextEncoding(s)) {
293            var len = s.length();
294            for (var i = 0; i < len; i++) {
295               var c = s.charAt(i);
296               if ((i == 0 || i == len - 1) && Character.isWhitespace(c) && ! preserveWhitespace)
297                  appendPaddedHexChar(w, c);
298               else if (REPLACE_TEXT.contains(c))
299                  w.append(REPLACE_TEXT.get(c));
300               else if (c == '_' && isEscapeSequence(s, i))
301                  appendPaddedHexChar(w, c);
302               else if (isValidXmlCharacter(c))
303                  w.append(c);
304               else
305                  appendPaddedHexChar(w, c);
306            }
307         } else {
308            w.append(s);
309         }
310      } catch (IOException e) {
311         throw toRex(e);
312      }
313
314      return w;
315   }
316
317   /**
318    * Escapes invalid XML text characters to <c>_x####_</c> sequences.
319    *
320    * @param value The object being encoded.
321    * @return The encoded string.
322    */
323   public static String escapeText(Object value) {
324      if (value == null)
325         return "_x0000_";
326      var s = value.toString();
327
328      try {
329         if (! needsTextEncoding(s))
330            return s;
331         var len = s.length();
332         var sw = new StringWriter(s.length() * 2);
333         for (var i = 0; i < len; i++) {
334            var c = s.charAt(i);
335            if ((i == 0 || i == len - 1) && Character.isWhitespace(c))
336               appendPaddedHexChar(sw, c);
337            else if (c == '_' && isEscapeSequence(s, i))
338               appendPaddedHexChar(sw, c);
339            else if (isValidXmlCharacter(c))
340               sw.append(c);
341            else
342               appendPaddedHexChar(sw, c);
343         }
344         return sw.toString();
345      } catch (IOException e) {
346         throw toRex(e); // Never happens
347      }
348   }
349
350   /**
351    * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations.
352    *
353    * <p>
354    * The annotations should be a parent-to-child ordering of annotations found on a class or method.
355    *
356    * @param xmls The list of <ja>@Xml</ja> annotations.
357    * @param schemas The list of <ja>@XmlSchema</ja> annotations.
358    * @return The namespace, or <jk>null</jk> if it couldn't be found.
359    */
360   public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) {
361
362      for (var i = xmls.size() - 1; i >= 0; i--) {
363         var xml = xmls.get(i);
364         var ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas);
365         if (nn(ns))
366            return ns;
367      }
368
369      for (var i = schemas.size() - 1; i >= 0; i--) {
370         var schema = schemas.get(i);
371         var ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas);
372         if (nn(ns))
373            return ns;
374      }
375
376      return null;
377   }
378
379   /**
380    * Utility method that converts the current event on the XML stream to something human-readable for debug purposes.
381    *
382    * @param r The XML stream reader whose current event is to be converted to a readable string.
383    * @return The event in human-readable form.
384    */
385   public static String toReadableEvent(XMLStreamReader r) {
386      int t = r.getEventType();
387      if (t == 1)
388         return "<" + r.getLocalName() + ">";
389      if (t == 2)
390         return "</" + r.getLocalName() + ">";
391      if (t == 3)
392         return "PROCESSING_INSTRUCTION";
393      if (t == 4)
394         return "CHARACTERS=[" + r.getText() + "]";
395      if (t == 5)
396         return "COMMENTS=[" + r.getText() + "]";
397      if (t == 6)
398         return "SPACE=[" + r.getText() + "]";
399      if (t == 7)
400         return "START_DOCUMENT";
401      if (t == 8)
402         return "END_DOCUMENT";
403      if (t == 9)
404         return "ENTITY_REFERENCE";
405      if (t == 10)
406         return "ATTRIBUTE";
407      if (t == 11)
408         return "DTD";
409      if (t == 12)
410         return "CDATA=[" + r.getText() + "]";
411      if (t == 13)
412         return "NAMESPACE";
413      if (t == 14)
414         return "NOTATION_DECLARATION";
415      if (t == 15)
416         return "ENTITY_DECLARATION";
417      return "UNKNOWN";
418   }
419
420   // Converts an integer to a hexadecimal string padded to 4 places.
421   private static Writer appendPaddedHexChar(Writer out, int num) throws IOException {
422      out.append("_x");
423      for (var c : toHex4(num))
424         out.append(c);
425      return out.append('_');
426   }
427
428   private static Writer encodeElementNameInner(Writer w, String s) throws IOException {
429      for (var i = 0; i < s.length(); i++) {
430         var c = s.charAt(i);
431         // @formatter:off
432         if ((c >= 'A' && c <= 'Z')
433               || (c == '_' && ! isEscapeSequence(s,i))
434               || (c >= 'a' && c <= 'z')
435               || (i != 0 && (
436                     c == '-'
437                     || c == '.'
438                     || (c >= '0' && c <= '9')
439                     || c == '\u00b7'
440                     || (c >= '\u0300' && c <= '\u036f')
441                     || (c >= '\u203f' && c <= '\u2040')
442                  ))
443               || (c >= '\u00c0' && c <= '\u00d6')
444               || (c >= '\u00d8' && c <= '\u00f6')
445               || (c >= '\u00f8' && c <= '\u02ff')
446               || (c >= '\u0370' && c <= '\u037d')
447               || (c >= '\u037f' && c <= '\u1fff')
448               || (c >= '\u200c' && c <= '\u200d')
449               || (c >= '\u2070' && c <= '\u218f')
450               || (c >= '\u2c00' && c <= '\u2fef')
451               || (c >= '\u3001' && c <= '\ud7ff')
452               || (c >= '\uf900' && c <= '\ufdcf')
453               || (c >= '\ufdf0' && c <= '\ufffd')) {
454            // @formatter:on
455            w.append(c);
456         } else {
457            appendPaddedHexChar(w, c);
458         }
459      }
460      return w;
461   }
462
463   private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) {
464
465      // If both prefix and namespace specified, use that Namespace mapping.
466      if (! (prefix.isEmpty() || ns.isEmpty()))
467         return Namespace.of(prefix, ns);
468
469      // If only prefix specified, need to search for namespaceURI.
470      if (! prefix.isEmpty()) {
471         if (nn(xmls))
472            for (var xml2 : xmls)
473               if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty())
474                  return Namespace.of(prefix, xml2.namespace());
475         for (var schema : schemas) {
476            if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty())
477               return Namespace.of(prefix, schema.namespace());
478            for (var xmlNs : schema.xmlNs())
479               if (xmlNs.prefix().equals(prefix))
480                  return Namespace.of(prefix, xmlNs.namespaceURI());
481         }
482         throw bex("Found @Xml.prefix annotation with no matching URI.  prefix=''{0}''", prefix);
483      }
484
485      // If only namespaceURI specified, need to search for prefix.
486      if (! ns.isEmpty()) {
487         if (nn(xmls))
488            for (var xml2 : xmls)
489               if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty())
490                  return Namespace.of(xml2.prefix(), ns);
491         for (var schema : schemas) {
492            if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty())
493               return Namespace.of(schema.prefix(), ns);
494            for (var xmlNs : schema.xmlNs())
495               if (xmlNs.namespaceURI().equals(ns))
496                  return Namespace.of(xmlNs.prefix(), ns);
497         }
498      }
499
500      return null;
501   }
502
503   // Returns true if the string at the specified position is of the form "_x####_"
504   // where '#' are hexadecimal characters.
505   private static boolean isEscapeSequence(String s, int i) {
506      // @formatter:off
507      return s.length() > i+6
508         && s.charAt(i) == '_'
509         && s.charAt(i+1) == 'x'
510         && isHexCharacter(s.charAt(i+2))
511         && isHexCharacter(s.charAt(i+3))
512         && isHexCharacter(s.charAt(i+4))
513         && isHexCharacter(s.charAt(i+5))
514         && s.charAt(i+6) == '_';
515      // @formatter:on
516   }
517
518   // Returns true if the character is a hexadecimal character
519   private static boolean isHexCharacter(char c) {
520      return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
521   }
522
523   // Returns true if the specified character can safely be used in XML text or an attribute.
524   private static boolean isValidXmlCharacter(char c) {
525      return (c >= 0x20 && c <= 0xD7FF) /*|| c == 0xA || c == 0xD*/ || (c >= 0xE000 && c <= 0xFFFD);
526   }
527
528   private static boolean needsAttrNameEncoding(String value) {
529      // Note that this doesn't need to be perfect, just fast.
530      for (var i = 0; i < value.length(); i++) {
531         var c = value.charAt(i);
532         if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')))
533            return true;
534      }
535      return false;
536   }
537
538   private static boolean needsAttrValueEncoding(String value) {
539      // See if we need to convert the string.
540      // Conversion is somewhat expensive, so make sure we need to do so before hand.
541      var len = value.length();
542      for (var i = 0; i < len; i++) {
543         var c = value.charAt(i);
544         if ((i == 0 || i == len - 1) && Character.isWhitespace(c))
545            return true;
546         if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value, i)))
547            return true;
548      }
549      return false;
550   }
551
552   private static boolean needsElementNameEncoding(String value) {
553      // Note that this doesn't need to be perfect, just fast.
554      for (var i = 0; i < value.length(); i++) {
555         var c = value.charAt(i);
556         if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && (c >= '0' && c <= '9')))
557            return true;
558      }
559      return false;
560   }
561
562   private static boolean needsTextEncoding(String value) {
563      // See if we need to convert the string.
564      // Conversion is somewhat expensive, so make sure we need to do so before hand.
565      var len = value.length();
566      for (var i = 0; i < len; i++) {
567         var c = value.charAt(i);
568         if ((i == 0 || i == len - 1) && Character.isWhitespace(c))
569            return true;
570         if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value, i)))
571            return true;
572      }
573      return false;
574   }
575}