001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.juneau.parser;
018
019import java.io.*;
020
021import org.apache.juneau.common.utils.*;
022import org.apache.juneau.internal.*;
023
024/**
025 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
026 *
027 * <p>
028 * Code is optimized to work with a 1 character buffer.
029 *
030 * <p>
031 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
032 * characters from the previous mark point.
033 *
034 * <h5 class='section'>Notes:</h5><ul>
035 *    <li class='warn'>This class is not thread safe.
036 * </ul>
037 *
038 * <h5 class='section'>See Also:</h5><ul>
039 *    <li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/SerializersAndParsers">Serializers and Parsers</a>
040
041 * </ul>
042 */
043public class ParserReader extends Reader implements Positionable {
044
045   /** Wrapped reader */
046   protected final Reader r;
047
048   private char[] buff;       // Internal character buffer
049   private int line = 1;      // Current line number
050   private int column;        // Current column number
051   private int iCurrent;      // Current pointer into character buffer
052   private int iMark = -1;    // Mark position in buffer
053   private int iEnd;          // The last good character position in the buffer
054   private boolean endReached, holesExist;
055   private final boolean unbuffered;
056
057   /**
058    * Constructor.
059    *
060    * @param pipe The parser input.
061    * @throws IOException Thrown by underlying stream.
062    */
063   public ParserReader(ParserPipe pipe) throws IOException {
064      this.unbuffered = pipe.unbuffered;
065      if (pipe.isString()) {
066         String in = pipe.getInputAsString();
067         this.r = new CharSequenceReader(in);
068         this.buff = new char[in.length() < 1024 ? in.length() : 1024];
069      } else {
070         Reader _r = pipe.getReader();
071         if (_r instanceof ParserReader)
072            this.r = ((ParserReader)_r).r;
073         else
074            this.r = _r;
075         this.buff = new char[1024];
076      }
077      pipe.setPositionable(this);
078   }
079
080   /**
081    * Reads a single character.
082    *
083    * <p>
084    * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
085    * returns them as two <jk>char</jk>s.
086    * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
087    *
088    * @return The character read, or -1 if the end of the stream has been reached.
089    * @throws IOException If a problem occurred trying to read from the reader.
090    */
091   @Override /* Reader */
092   public final int read() throws IOException {
093      int c = readFromBuff();
094      if (c == -1)
095         return -1;
096      if (c == '\n') {
097         line++;
098         column = 0;
099      } else {
100         column++;
101      }
102      return c;
103   }
104
105   /**
106    * Same as {@link #read()} but skips over any whitespace characters.
107    *
108    * @return The first non-whitespace character, or -1 if the end of stream reached.
109    * @throws IOException Thrown by underlying stream.
110    */
111   public final int readSkipWs() throws IOException {
112      while (true) {
113         int c = read();
114         if (c == -1 || ! Character.isWhitespace(c))
115            return c;
116      }
117   }
118
119   /**
120    * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
121    *
122    * @return The character read, or -1 if the end of the stream has been reached.
123    * @throws IOException If a problem occurred trying to read from the reader.
124    */
125   public final int readCodePoint() throws IOException {
126      int c = read();
127
128      // Characters that take up 2 chars.
129      if (c >= 0xd800 && c <= 0xdbff) {
130         int low = read();
131         if (low >= 0xdc00 && low <= 0xdfff)
132            c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
133      }
134
135      return c;
136   }
137
138   private final int readFromBuff() throws IOException {
139      while (iCurrent >= iEnd) {
140         if (endReached)
141            return -1;
142
143         // If there's still space at the end of this buffer, fill it.
144         // Make sure there's at least 2 character spaces free for extended unicode characters.
145         //if (false) {
146         if (iEnd+1 < buff.length) {
147            int x = read(buff, iCurrent, buff.length-iEnd);
148            if (x == -1) {
149               endReached = true;
150               return -1;
151            }
152            iEnd += x;
153
154         } else {
155            // If we're currently marking, then we want to copy from the current mark point
156            // to the beginning of the buffer and then fill in the remainder of buffer.
157            if (iMark >= 0) {
158
159               // If we're marking from the beginning of the array, we double the size of the
160               // buffer.  This isn't likely to occur often.
161               if (iMark == 0) {
162                  char[] buff2 = new char[buff.length<<1];
163                  System.arraycopy(buff, 0, buff2, 0, buff.length);
164                  buff = buff2;
165
166               // Otherwise, we copy what's currently marked to the beginning of the buffer.
167               } else {
168                  int copyBuff = iMark;
169                  System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
170                  iCurrent -= copyBuff;
171                  iMark -= copyBuff;
172               }
173               int expected = buff.length - iCurrent;
174
175               int x = read(buff, iCurrent, expected);
176               if (x == -1) {
177                  endReached = true;
178                  iEnd = iCurrent;
179                  return -1;
180               }
181               iEnd = iCurrent + x;
182            } else {
183               // Copy the last 10 chars in the buffer to the beginning of the buffer.
184               int copyBuff = Math.min(iCurrent, 10);
185               System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff);
186
187               // Number of characters we expect to copy on the next read.
188               int expected = buff.length - copyBuff;
189               int x = read(buff, copyBuff, expected);
190               iCurrent = copyBuff;
191               if (x == -1) {
192                  endReached = true;
193                  iEnd = iCurrent;
194                  return -1;
195               }
196               iEnd = iCurrent + x;
197            }
198         }
199      }
200      return buff[iCurrent++];
201   }
202
203   /**
204    * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}.
205    */
206   public final void mark() {
207      iMark = iCurrent;
208   }
209
210   /**
211    * Peeks the next character in the stream.
212    *
213    * <p>
214    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
215    *
216    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
217    * @throws IOException If a problem occurred trying to read from the reader.
218    */
219   public final int peek() throws IOException {
220      int c = read();
221      if (c != -1)
222         unread();
223      return c;
224   }
225
226   /**
227    * Same as {@link #peek()} but skips over any whitespace characters.
228    *
229    * <p>
230    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
231    *
232    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
233    * @throws IOException If a problem occurred trying to read from the reader.
234    */
235   public final int peekSkipWs() throws IOException {
236      while(true) {
237         int c = read();
238         boolean isWs = Character.isWhitespace(c);
239         if (c != -1 && ! isWs)
240            unread();
241         if (! isWs)
242            return c;
243      }
244   }
245
246   /**
247    * Read the specified number of characters off the stream.
248    *
249    * @param num The number of characters to read.
250    * @return The characters packaged as a String.
251    * @throws IOException If a problem occurred trying to read from the reader.
252    */
253   public final String read(int num) throws IOException {
254      char[] c = new char[num];
255      for (int i = 0; i < num; i++) {
256         int c2 = read();
257         if (c2 == -1)
258            return new String(c, 0, i);
259         c[i] = (char)c2;
260      }
261      return new String(c);
262   }
263
264   /**
265    * Pushes the last read character back into the stream.
266    *
267    * @return This object.
268    * @throws IOException If a problem occurred trying to read from the reader.
269    */
270   public ParserReader unread() throws IOException {
271      if (iCurrent <= 0)
272         throw new IOException("Buffer underflow.");
273      iCurrent--;
274      if (column == 0)
275         line--;
276      else
277         column--;
278      return this;
279   }
280
281   /**
282    * No-op.
283    *
284    * <p>
285    * Input readers are closed in the {@link ParserPipe} class.
286    *
287    * @throws IOException If a problem occurred trying to read from the reader.
288    */
289   @Override /* Reader */
290   public void close() throws IOException {
291      // No-op
292   }
293
294   /**
295    * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
296    *
297    * @return The contents of the reusable character buffer as a string.
298    */
299   public final String getMarked() {
300      return getMarked(0, 0);
301   }
302
303   /**
304    * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
305    *
306    * <p>
307    * For example, to return the marked string, but trim the first and last characters, call the following:
308    * <p class='bjava'>
309    *    getFromMarked(1, -1);
310    * </p>
311    *
312    * @param offsetStart The offset of the start position.
313    * @param offsetEnd The offset of the end position.
314    * @return The contents of the reusable character buffer as a string.
315    */
316   public final String getMarked(int offsetStart, int offsetEnd) {
317      int offset = 0;
318
319      // Holes are \u00FF 'delete' characters that we need to get rid of now.
320      if (holesExist) {
321         for (int i = iMark; i < iCurrent; i++) {
322            char c = buff[i];
323            if (c == 127)
324               offset++;
325            else
326               buff[i-offset] = c;
327         }
328         holesExist = false;
329      }
330      int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
331      String s = new String(buff, start, len);
332      iMark = -1;
333      return s;
334   }
335
336   /**
337    * Trims off the last character in the marking buffer.
338    *
339    * <p>
340    * Useful for removing escape characters from sequences.
341    *
342    * @return This object.
343    */
344   public final ParserReader delete() {
345      return delete(1);
346   }
347
348   /**
349    * Trims off the specified number of last characters in the marking buffer.
350    * Useful for removing escape characters from sequences.
351    *
352    * @param count The number of characters to delete.
353    * @return This object.
354    */
355   public final ParserReader delete(int count) {
356      for (int i = 0; i < count; i++)
357         buff[iCurrent-i-1] = 127;
358      holesExist = true;
359      return this;
360   }
361
362   /**
363    * Replaces the last character in the marking buffer with the specified character.
364    *
365    * <p>
366    * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended
367    * unicode characters in order for the replacement to fit into the buffer.
368    *
369    * @param c The new character.
370    * @param offset The offset.
371    * @return This object.
372    * @throws IOException Thrown by underlying stream.
373    */
374   public final ParserReader replace(int c, int offset) throws IOException {
375      if (c < 0x10000) {
376         if (offset < 1)
377            throw new IOException("Buffer underflow.");
378         buff[iCurrent-offset] = (char)c;
379      } else {
380         if (offset < 2)
381            throw new IOException("Buffer underflow.");
382         c -= 0x10000;
383         buff[iCurrent-offset] = (char)(0xd800 + (c >> 10));
384         buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff));
385         offset--;
386      }
387      // Fill in the gap with DEL characters.
388      for (int i = 1; i < offset; i++)
389         buff[iCurrent-i] = 127;
390      holesExist |= (offset > 1);
391      return this;
392   }
393
394   /**
395    * Replace the last read character in the buffer with the specified character.
396    *
397    * @param c The new character.
398    * @return This object.
399    * @throws IOException Thrown by underlying stream.
400    */
401   public final ParserReader replace(char c) throws IOException {
402      return replace(c, 1);
403   }
404   /**
405    * Reads a numeric string from the specified reader.
406    *
407    * @return The parsed number string.
408    * @throws IOException Thrown by underlying stream.
409    */
410   public String parseNumberString() throws IOException {
411      mark();
412      int c = 0;
413      while (true) {
414         c = read();
415         if (c == -1)
416            break;
417         if (! StringUtils.isNumberChar((char)c)) {
418            unread();
419            break;
420         }
421      }
422      return getMarked();
423   }
424
425   /**
426    * Subclasses can override this method to provide additional filtering.
427    *
428    * <p>
429    * Default implementation simply calls the same method on the underlying reader.
430    */
431   @Override /* Reader */
432   public int read(char[] cbuf, int off, int len) throws IOException {
433      return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
434   }
435
436   @Override /* Positionable */
437   public Position getPosition() {
438      return new Position(line, column);
439   }
440}