001// ***************************************************************************************************************************
002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
003// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
005// * with the License.  You may obtain a copy of the License at                                                              *
006// *                                                                                                                         *
007// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
008// *                                                                                                                         *
009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
011// * specific language governing permissions and limitations under the License.                                              *
012// ***************************************************************************************************************************
013package org.apache.juneau.uon;
014
015import java.io.*;
016
017import org.apache.juneau.parser.*;
018
019/**
020 * Same functionality as {@link ParserReader} except automatically decoded <c>%xx</c> escape sequences.
021 *
022 * <p>
023 * Escape sequences are assumed to be encoded UTF-8.  Extended Unicode (&gt;\u10000) is supported.
024 *
025 * <p>
026 * If decoding is enabled, the following character replacements occur so that boundaries are not lost:
027 * <ul>
028 *    <li><js>'&amp;'</js> -&gt; <js>'\u0001'</js>
029 *    <li><js>'='</js> -&gt; <js>'\u0002'</js>
030 * </ul>
031 */
032public final class UonReader extends ParserReader {
033
034   private final boolean decodeChars;
035   private final char[] buff;
036
037   // Writable properties.
038   private int iCurrent, iEnd;
039
040
041   /**
042    * Constructor.
043    *
044    * @param pipe The parser input.
045    * @param decodeChars Whether the input is URL-encoded.
046    * @throws IOException Thrown by underlying stream.
047    */
048   public UonReader(ParserPipe pipe, boolean decodeChars) throws IOException {
049      super(pipe);
050      this.decodeChars = decodeChars;
051      if (pipe.isString()) {
052         String in = pipe.getInputAsString();
053         this.buff = new char[in.length() < 1024 ? in.length() : 1024];
054      } else {
055         this.buff = new char[1024];
056      }
057   }
058
059   @Override /* Reader */
060   public final int read(char[] cbuf, int off, int len) throws IOException {
061
062      if (! decodeChars)
063         return super.read(cbuf, off, len);
064
065      // Copy any remainder to the beginning of the buffer.
066      int remainder = iEnd - iCurrent;
067      if (remainder > 0)
068         System.arraycopy(buff, iCurrent, buff, 0, remainder);
069      iCurrent = 0;
070
071      int expected = buff.length - remainder;
072
073      int x = super.read(buff, remainder, expected);
074      if (x == -1 && remainder == 0)
075         return -1;
076
077      iEnd = remainder + (x == -1 ? 0 : x);
078
079      int i = 0;
080      while (i < len) {
081         if (iCurrent >= iEnd)
082            return i;
083         char c = buff[iCurrent++];
084         if (c == '+') {
085            cbuf[off + i++] = ' ';
086         } else if (c == '&') {
087            cbuf[off + i++] = '\u0001';
088         } else if (c == '=') {
089            cbuf[off + i++] = '\u0002';
090         } else if (c != '%') {
091            cbuf[off + i++] = c;
092         } else {
093            int iMark = iCurrent-1;  // Keep track of current position.
094
095            // Stop if there aren't at least two more characters following '%' in the buffer,
096            // or there aren't at least two more positions open in cbuf to handle double-char chars.
097            if (iMark+2 >= iEnd || i+2 > len) {
098               iCurrent--;
099               return i;
100            }
101
102            int b0 = readEncodedByte();
103            int cx;
104
105            // 0xxxxxxx
106            if (b0 < 128) {
107               cx = b0;
108
109            // 10xxxxxx
110            } else if (b0 < 192) {
111               throw new IOException("Invalid hex value for first escape pattern in UTF-8 sequence:  " + b0);
112
113            // 110xxxxx 10xxxxxx
114            // 11000000(192) - 11011111(223)
115            } else if (b0 < 224) {
116               cx = readUTF8(b0-192, 1);
117               if (cx == -1) {
118                  iCurrent = iMark;
119                  return i;
120               }
121
122            // 1110xxxx 10xxxxxx 10xxxxxx
123            // 11100000(224) - 11101111(239)
124            } else if (b0 < 240) {
125               cx = readUTF8(b0-224, 2);
126               if (cx == -1) {
127                  iCurrent = iMark;
128                  return i;
129               }
130
131            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
132            // 11110000(240) - 11110111(247)
133            } else if (b0 < 248) {
134               cx = readUTF8(b0-240, 3);
135               if (cx == -1) {
136                  iCurrent = iMark;
137                  return i;
138               }
139
140            } else
141               throw new IOException("Invalid hex value for first escape pattern in UTF-8 sequence:  " + b0);
142
143            if (cx < 0x10000)
144               cbuf[off + i++] = (char)cx;
145            else {
146               cx -= 0x10000;
147               cbuf[off + i++] = (char)(0xd800 + (cx >> 10));
148               cbuf[off + i++] = (char)(0xdc00 + (cx & 0x3ff));
149            }
150         }
151      }
152      return i;
153   }
154
155   private int readUTF8(int n, final int numBytes) throws IOException {
156      if (iCurrent + numBytes*3 > iEnd)
157         return -1;
158      for (int i = 0; i < numBytes; i++) {
159         n <<= 6;
160         n += readHex()-128;
161      }
162      return n;
163   }
164
165   private int readHex() throws IOException {
166      int c = buff[iCurrent++];
167      if (c != '%')
168         throw new IOException("Did not find expected '%' character in UTF-8 sequence.");
169      return readEncodedByte();
170   }
171
172   private int readEncodedByte() throws IOException {
173      if (iEnd <= iCurrent + 1)
174         throw new IOException("Incomplete trailing escape pattern");
175      int h = buff[iCurrent++];
176      int l = buff[iCurrent++];
177      h = fromHexChar(h);
178      l = fromHexChar(l);
179      return (h << 4) + l;
180   }
181
182   private static int fromHexChar(int c) throws IOException {
183      if (c >= '0' && c <= '9')
184         return c - '0';
185      if (c >= 'a' && c <= 'f')
186         return 10 + c - 'a';
187      if (c >= 'A' && c <= 'F')
188         return 10 + c - 'A';
189      throw new IOException("Invalid hex character '"+c+"' found in escape pattern.");
190   }
191
192   @Override /* ParserReader */
193   public final UonReader unread() throws IOException {
194      super.unread();
195      return this;
196   }
197}