001// ***************************************************************************************************************************
002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
003// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
005// * with the License.  You may obtain a copy of the License at                                                              *
006// *                                                                                                                         *
007// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
008// *                                                                                                                         *
009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
011// * specific language governing permissions and limitations under the License.                                              *
012// ***************************************************************************************************************************
013package org.apache.juneau.parser;
014
015import java.io.*;
016
017import org.apache.juneau.internal.*;
018
019/**
020 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
021 *
022 * <p>
023 * Code is optimized to work with a 1 character buffer.
024 *
025 * <p>
026 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
027 * characters from the previous mark point.
028 *
029 * <p>
030 * <b>Warning:</b>  Not thread safe.
031 */
032public class ParserReader extends Reader implements Positionable {
033
034   /** Wrapped reader */
035   protected final Reader r;
036
037   private char[] buff;       // Internal character buffer
038   private int line = 1;      // Current line number
039   private int column;        // Current column number
040   private int iCurrent = 0;  // Current pointer into character buffer
041   private int iMark = -1;    // Mark position in buffer
042   private int iEnd = 0;      // The last good character position in the buffer
043   private boolean endReached, holesExist;
044   private final boolean unbuffered;
045
046   /**
047    * Constructor.
048    *
049    * @param pipe The parser input.
050    * @throws IOException Thrown by underlying stream.
051    */
052   public ParserReader(ParserPipe pipe) throws IOException {
053      this.unbuffered = pipe.unbuffered;
054      if (pipe.isString()) {
055         String in = pipe.getInputAsString();
056         this.r = new CharSequenceReader(in);
057         this.buff = new char[in.length() < 1024 ? in.length() : 1024];
058      } else {
059         Reader _r = pipe.getReader();
060         if (_r instanceof ParserReader)
061            this.r = ((ParserReader)_r).r;
062         else
063            this.r = _r;
064         this.buff = new char[1024];
065      }
066      pipe.setPositionable(this);
067   }
068
069   /**
070    * Reads a single character.
071    *
072    * <p>
073    * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
074    * returns them as two <jk>char</jk>s.
075    * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
076    *
077    * @return The character read, or -1 if the end of the stream has been reached.
078    * @throws IOException If a problem occurred trying to read from the reader.
079    */
080   @Override /* Reader */
081   public final int read() throws IOException {
082      int c = readFromBuff();
083      if (c == -1)
084         return -1;
085      if (c == '\n') {
086         line++;
087         column = 0;
088      } else {
089         column++;
090      }
091      return c;
092   }
093
094   /**
095    * Same as {@link #read()} but skips over any whitespace characters.
096    *
097    * @return The first non-whitespace character, or -1 if the end of stream reached.
098    * @throws IOException Thrown by underlying stream.
099    */
100   public final int readSkipWs() throws IOException {
101      while (true) {
102         int c = read();
103         if (c == -1 || ! Character.isWhitespace(c))
104            return c;
105      }
106   }
107
108   /**
109    * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
110    *
111    * @return The character read, or -1 if the end of the stream has been reached.
112    * @throws IOException If a problem occurred trying to read from the reader.
113    */
114   public final int readCodePoint() throws IOException {
115      int c = read();
116
117      // Characters that take up 2 chars.
118      if (c >= 0xd800 && c <= 0xdbff) {
119         int low = read();
120         if (low >= 0xdc00 && low <= 0xdfff)
121            c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
122      }
123
124      return c;
125   }
126
127   private final int readFromBuff() throws IOException {
128      while (iCurrent >= iEnd) {
129         if (endReached)
130            return -1;
131
132         // If there's still space at the end of this buffer, fill it.
133         // Make sure there's at least 2 character spaces free for extended unicode characters.
134         //if (false) {
135         if (iEnd+1 < buff.length) {
136            int x = read(buff, iCurrent, buff.length-iEnd);
137            if (x == -1) {
138               endReached = true;
139               return -1;
140            }
141            iEnd += x;
142
143         } else {
144            // If we're currently marking, then we want to copy from the current mark point
145            // to the beginning of the buffer and then fill in the remainder of buffer.
146            if (iMark >= 0) {
147
148               // If we're marking from the beginning of the array, we double the size of the
149               // buffer.  This isn't likely to occur often.
150               if (iMark == 0) {
151                  char[] buff2 = new char[buff.length<<1];
152                  System.arraycopy(buff, 0, buff2, 0, buff.length);
153                  buff = buff2;
154
155               // Otherwise, we copy what's currently marked to the beginning of the buffer.
156               } else {
157                  int copyBuff = iMark;
158                  System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
159                  iCurrent -= copyBuff;
160                  iMark -= copyBuff;
161               }
162               int expected = buff.length - iCurrent;
163
164               int x = read(buff, iCurrent, expected);
165               if (x == -1) {
166                  endReached = true;
167                  iEnd = iCurrent;
168                  return -1;
169               }
170               iEnd = iCurrent + x;
171            } else {
172               // Copy the last 10 chars in the buffer to the beginning of the buffer.
173               int copyBuff = Math.min(iCurrent, 10);
174               System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff);
175
176               // Number of characters we expect to copy on the next read.
177               int expected = buff.length - copyBuff;
178               int x = read(buff, copyBuff, expected);
179               iCurrent = copyBuff;
180               if (x == -1) {
181                  endReached = true;
182                  iEnd = iCurrent;
183                  return -1;
184               }
185               iEnd = iCurrent + x;
186            }
187         }
188      }
189      return buff[iCurrent++];
190   }
191
192   /**
193    * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}.
194    */
195   public final void mark() {
196      iMark = iCurrent;
197   }
198
199   /**
200    * Peeks the next character in the stream.
201    *
202    * <p>
203    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
204    *
205    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
206    * @throws IOException If a problem occurred trying to read from the reader.
207    */
208   public final int peek() throws IOException {
209      int c = read();
210      if (c != -1)
211         unread();
212      return c;
213   }
214
215   /**
216    * Same as {@link #peek()} but skips over any whitespace characters.
217    *
218    * <p>
219    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
220    *
221    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
222    * @throws IOException If a problem occurred trying to read from the reader.
223    */
224   public final int peekSkipWs() throws IOException {
225      while(true) {
226         int c = read();
227         boolean isWs = Character.isWhitespace(c);
228         if (c != -1 && ! isWs)
229            unread();
230         if (! isWs)
231            return c;
232      }
233   }
234
235   /**
236    * Read the specified number of characters off the stream.
237    *
238    * @param num The number of characters to read.
239    * @return The characters packaged as a String.
240    * @throws IOException If a problem occurred trying to read from the reader.
241    */
242   public final String read(int num) throws IOException {
243      char[] c = new char[num];
244      for (int i = 0; i < num; i++) {
245         int c2 = read();
246         if (c2 == -1)
247            return new String(c, 0, i);
248         c[i] = (char)c2;
249      }
250      return new String(c);
251   }
252
253   /**
254    * Pushes the last read character back into the stream.
255    *
256    * @return This object (for method chaining).
257    * @throws IOException If a problem occurred trying to read from the reader.
258    */
259   public ParserReader unread() throws IOException {
260      if (iCurrent <= 0)
261         throw new IOException("Buffer underflow.");
262      iCurrent--;
263      if (column == 0)
264         line--;
265      else
266         column--;
267      return this;
268   }
269
270   /**
271    * No-op.
272    *
273    * <p>
274    * Input readers are closed in the {@link ParserPipe} class.
275    *
276    * @throws IOException If a problem occurred trying to read from the reader.
277    */
278   @Override /* Reader */
279   public void close() throws IOException {
280      // No-op
281   }
282
283   /**
284    * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
285    *
286    * @return The contents of the reusable character buffer as a string.
287    */
288   public final String getMarked() {
289      return getMarked(0, 0);
290   }
291
292   /**
293    * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
294    *
295    * <p>
296    * For example, to return the marked string, but trim the first and last characters, call the following:
297    * <p class='bcode w800'>
298    *    getFromMarked(1, -1);
299    * </p>
300    *
301    * @param offsetStart The offset of the start position.
302    * @param offsetEnd The offset of the end position.
303    * @return The contents of the reusable character buffer as a string.
304    */
305   public final String getMarked(int offsetStart, int offsetEnd) {
306      int offset = 0;
307
308      // Holes are \u00FF 'delete' characters that we need to get rid of now.
309      if (holesExist) {
310         for (int i = iMark; i < iCurrent; i++) {
311            char c = buff[i];
312            if (c == 127)
313               offset++;
314            else
315               buff[i-offset] = c;
316         }
317         holesExist = false;
318      }
319      int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
320      String s = new String(buff, start, len);
321      iMark = -1;
322      return s;
323   }
324
325   /**
326    * Trims off the last character in the marking buffer.
327    *
328    * <p>
329    * Useful for removing escape characters from sequences.
330    *
331    * @return This object (for method chaining).
332    */
333   public final ParserReader delete() {
334      return delete(1);
335   }
336
337   /**
338    * Trims off the specified number of last characters in the marking buffer.
339    * Useful for removing escape characters from sequences.
340    *
341    * @param count The number of characters to delete.
342    * @return This object (for method chaining).
343    */
344   public final ParserReader delete(int count) {
345      for (int i = 0; i < count; i++)
346         buff[iCurrent-i-1] = 127;
347      holesExist = true;
348      return this;
349   }
350
351   /**
352    * Replaces the last character in the marking buffer with the specified character.
353    *
354    * <p>
355    * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended
356    * unicode characters in order for the replacement to fit into the buffer.
357    *
358    * @param c The new character.
359    * @param offset The offset.
360    * @return This object (for method chaining).
361    * @throws IOException Thrown by underlying stream.
362    */
363   public final ParserReader replace(int c, int offset) throws IOException {
364      if (c < 0x10000) {
365         if (offset < 1)
366            throw new IOException("Buffer underflow.");
367         buff[iCurrent-offset] = (char)c;
368      } else {
369         if (offset < 2)
370            throw new IOException("Buffer underflow.");
371         c -= 0x10000;
372         buff[iCurrent-offset] = (char)(0xd800 + (c >> 10));
373         buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff));
374         offset--;
375      }
376      // Fill in the gap with DEL characters.
377      for (int i = 1; i < offset; i++)
378         buff[iCurrent-i] = 127;
379      holesExist |= (offset > 1);
380      return this;
381   }
382
383   /**
384    * Replace the last read character in the buffer with the specified character.
385    *
386    * @param c The new character.
387    * @return This object (for method chaining).
388    * @throws IOException Thrown by underlying stream.
389    */
390   public final ParserReader replace(char c) throws IOException {
391      return replace(c, 1);
392   }
393
394   /**
395    * Subclasses can override this method to provide additional filtering.
396    *
397    * <p>
398    * Default implementation simply calls the same method on the underlying reader.
399    */
400   @Override /* Reader */
401   public int read(char[] cbuf, int off, int len) throws IOException {
402      return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
403   }
404
405   @Override /* Positionable */
406   public Position getPosition() {
407      return new Position(line, column);
408   }
409}