001// ***************************************************************************************************************************
002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
003// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
005// * with the License.  You may obtain a copy of the License at                                                              *
006// *                                                                                                                         *
007// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
008// *                                                                                                                         *
009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
011// * specific language governing permissions and limitations under the License.                                              *
012// ***************************************************************************************************************************
013package org.apache.juneau.parser;
014
015import java.io.*;
016
017import org.apache.juneau.*;
018import org.apache.juneau.internal.*;
019
020/**
021 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
022 *
023 * <p>
024 * Code is optimized to work with a 1 character buffer.
025 *
026 * <p>
027 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
028 * characters from the previous mark point.
029 *
030 * <p>
031 * <b>Warning:</b>  Not thread safe.
032 */
033public class ParserReader extends Reader implements Positionable {
034
035   /** Wrapped reader */
036   protected final Reader r;
037
038   private char[] buff;       // Internal character buffer
039   private int line = 1;      // Current line number
040   private int column;        // Current column number
041   private int iCurrent = 0;  // Current pointer into character buffer
042   private int iMark = -1;    // Mark position in buffer
043   private int iEnd = 0;      // The last good character position in the buffer
044   private boolean endReached, holesExist;
045   private final boolean unbuffered;
046
047   /**
048    * Constructor.
049    *
050    * @param pipe The parser input.
051    * @throws IOException
052    */
053   public ParserReader(ParserPipe pipe) throws IOException {
054      this.unbuffered = pipe.unbuffered;
055      if (pipe.isString()) {
056         String in = pipe.getInputAsString();
057         this.r = new CharSequenceReader(in);
058         this.buff = new char[in.length() < 1024 ? in.length() : 1024];
059      } else {
060         Reader _r = pipe.getReader();
061         if (_r instanceof ParserReader)
062            this.r = ((ParserReader)_r).r;
063         else
064            this.r = _r;
065         this.buff = new char[1024];
066      }
067      pipe.setPositionable(this);
068   }
069
070   /**
071    * Reads a single character.
072    *
073    * <p>
074    * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
075    * returns them as two <jk>char</jk>s.
076    * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
077    *
078    * @return The character read, or -1 if the end of the stream has been reached.
079    * @throws IOException If a problem occurred trying to read from the reader.
080    */
081   @Override /* Reader */
082   public final int read() throws IOException {
083      int c = readFromBuff();
084      if (c == -1)
085         return -1;
086      if (c == '\n') {
087         line++;
088         column = 0;
089      } else {
090         column++;
091      }
092      return c;
093   }
094
095   /**
096    * Same as {@link #read()} but skips over any whitespace characters.
097    *
098    * @return The first non-whitespace character, or -1 if the end of stream reached.
099    * @throws IOException
100    */
101   public final int readSkipWs() throws IOException {
102      while (true) {
103         int c = read();
104         if (c == -1 || ! Character.isWhitespace(c))
105            return c;
106      }
107   }
108
109   /**
110    * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
111    *
112    * @return The character read, or -1 if the end of the stream has been reached.
113    * @throws IOException If a problem occurred trying to read from the reader.
114    */
115   public final int readCodePoint() throws IOException {
116      int c = read();
117
118      // Characters that take up 2 chars.
119      if (c >= 0xd800 && c <= 0xdbff) {
120         int low = read();
121         if (low >= 0xdc00 && low <= 0xdfff)
122            c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
123      }
124
125      return c;
126   }
127
128   private final int readFromBuff() throws IOException {
129      while (iCurrent >= iEnd) {
130         if (endReached)
131            return -1;
132
133         // If there's still space at the end of this buffer, fill it.
134         // Make sure there's at least 2 character spaces free for extended unicode characters.
135         //if (false) {
136         if (iEnd+1 < buff.length) {
137            int x = read(buff, iCurrent, buff.length-iEnd);
138            if (x == -1) {
139               endReached = true;
140               return -1;
141            }
142            iEnd += x;
143
144         } else {
145            // If we're currently marking, then we want to copy from the current mark point
146            // to the beginning of the buffer and then fill in the remainder of buffer.
147            if (iMark >= 0) {
148
149               // If we're marking from the beginning of the array, we double the size of the
150               // buffer.  This isn't likely to occur often.
151               if (iMark == 0) {
152                  char[] buff2 = new char[buff.length<<1];
153                  System.arraycopy(buff, 0, buff2, 0, buff.length);
154                  buff = buff2;
155
156               // Otherwise, we copy what's currently marked to the beginning of the buffer.
157               } else {
158                  int copyBuff = iMark;
159                  System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
160                  iCurrent -= copyBuff;
161                  iMark -= copyBuff;
162               }
163               int expected = buff.length - iCurrent;
164
165               int x = read(buff, iCurrent, expected);
166               if (x == -1) {
167                  endReached = true;
168                  iEnd = iCurrent;
169                  return -1;
170               }
171               iEnd = iCurrent + x;
172            } else {
173               // Copy the last 10 chars in the buffer to the beginning of the buffer.
174               int copyBuff = Math.min(iCurrent, 10);
175               System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff);
176
177               // Number of characters we expect to copy on the next read.
178               int expected = buff.length - copyBuff;
179               int x = read(buff, copyBuff, expected);
180               iCurrent = copyBuff;
181               if (x == -1) {
182                  endReached = true;
183                  iEnd = iCurrent;
184                  return -1;
185               }
186               iEnd = iCurrent + x;
187            }
188         }
189      }
190      return buff[iCurrent++];
191   }
192
193   /**
194    * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}.
195    */
196   public final void mark() {
197      iMark = iCurrent;
198   }
199
200   /**
201    * Peeks the next character in the stream.
202    *
203    * <p>
204    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
205    *
206    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
207    * @throws IOException If a problem occurred trying to read from the reader.
208    */
209   public final int peek() throws IOException {
210      int c = read();
211      if (c != -1)
212         unread();
213      return c;
214   }
215
216   /**
217    * Same as {@link #peek()} but skips over any whitespace characters.
218    *
219    * <p>
220    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
221    *
222    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
223    * @throws IOException If a problem occurred trying to read from the reader.
224    */
225   public final int peekSkipWs() throws IOException {
226      while(true) {
227         int c = read();
228         boolean isWs = Character.isWhitespace(c);
229         if (c != -1 && ! isWs)
230            unread();
231         if (! isWs)
232            return c;
233      }
234   }
235
236   /**
237    * Read the specified number of characters off the stream.
238    *
239    * @param num The number of characters to read.
240    * @return The characters packaged as a String.
241    * @throws IOException If a problem occurred trying to read from the reader.
242    */
243   public final String read(int num) throws IOException {
244      char[] c = new char[num];
245      for (int i = 0; i < num; i++) {
246         int c2 = read();
247         if (c2 == -1)
248            return new String(c, 0, i);
249         c[i] = (char)c2;
250      }
251      return new String(c);
252   }
253
254   /**
255    * Pushes the last read character back into the stream.
256    *
257    * @return This object (for method chaining).
258    * @throws IOException If a problem occurred trying to read from the reader.
259    */
260   public ParserReader unread() throws IOException {
261      if (iCurrent <= 0)
262         throw new IOException("Buffer underflow.");
263      iCurrent--;
264      if (column == 0)
265         line--;
266      else
267         column--;
268      return this;
269   }
270
271   /**
272    * No-op.
273    *
274    * <p>
275    * Input readers are closed in the {@link ParserPipe} class.
276    *
277    * @throws IOException If a problem occurred trying to read from the reader.
278    */
279   @Override /* Reader */
280   public void close() throws IOException {
281      // No-op
282   }
283
284   /**
285    * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
286    *
287    * @return The contents of the reusable character buffer as a string.
288    */
289   public final String getMarked() {
290      return getMarked(0, 0);
291   }
292
293   /**
294    * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
295    *
296    * <p>
297    * For example, to return the marked string, but trim the first and last characters, call the following:
298    * <p class='bcode w800'>
299    *    getFromMarked(1, -1);
300    * </p>
301    *
302    * @param offsetStart The offset of the start position.
303    * @param offsetEnd The offset of the end position.
304    * @return The contents of the reusable character buffer as a string.
305    */
306   public final String getMarked(int offsetStart, int offsetEnd) {
307      int offset = 0;
308
309      // Holes are \u00FF 'delete' characters that we need to get rid of now.
310      if (holesExist) {
311         for (int i = iMark; i < iCurrent; i++) {
312            char c = buff[i];
313            if (c == 127)
314               offset++;
315            else
316               buff[i-offset] = c;
317         }
318         holesExist = false;
319      }
320      int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
321      String s = new String(buff, start, len);
322      iMark = -1;
323      return s;
324   }
325
326   /**
327    * Trims off the last character in the marking buffer.
328    *
329    * <p>
330    * Useful for removing escape characters from sequences.
331    *
332    * @return This object (for method chaining).
333    */
334   public final ParserReader delete() {
335      return delete(1);
336   }
337
338   /**
339    * Trims off the specified number of last characters in the marking buffer.
340    * Useful for removing escape characters from sequences.
341    *
342    * @param count The number of characters to delete.
343    * @return This object (for method chaining).
344    */
345   public final ParserReader delete(int count) {
346      for (int i = 0; i < count; i++)
347         buff[iCurrent-i-1] = 127;
348      holesExist = true;
349      return this;
350   }
351
352   /**
353    * Replaces the last character in the marking buffer with the specified character.
354    *
355    * <p>
356    * <code>offset</code> must be at least <code>1</code> for normal characters, and <code>2</code> for extended
357    * unicode characters in order for the replacement to fit into the buffer.
358    *
359    * @param c The new character.
360    * @param offset The offset.
361    * @return This object (for method chaining).
362    * @throws IOException
363    */
364   public final ParserReader replace(int c, int offset) throws IOException {
365      if (c < 0x10000) {
366         if (offset < 1)
367            throw new IOException("Buffer underflow.");
368         buff[iCurrent-offset] = (char)c;
369      } else {
370         if (offset < 2)
371            throw new IOException("Buffer underflow.");
372         c -= 0x10000;
373         buff[iCurrent-offset] = (char)(0xd800 + (c >> 10));
374         buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff));
375         offset--;
376      }
377      // Fill in the gap with DEL characters.
378      for (int i = 1; i < offset; i++)
379         buff[iCurrent-i] = 127;
380      holesExist |= (offset > 1);
381      return this;
382   }
383
384   /**
385    * Replace the last read character in the buffer with the specified character.
386    *
387    * @param c The new character.
388    * @return This object (for method chaining).
389    * @throws IOException
390    */
391   public final ParserReader replace(char c) throws IOException {
392      return replace(c, 1);
393   }
394
395   /**
396    * Subclasses can override this method to provide additional filtering.
397    *
398    * <p>
399    * Default implementation simply calls the same method on the underlying reader.
400    */
401   @Override /* Reader */
402   public int read(char[] cbuf, int off, int len) throws IOException {
403      return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
404   }
405
406   @Override /* Positionable */
407   public Position getPosition() {
408      return new Position(line, column);
409   }
410
411   /**
412    * @deprecated Unused.
413    */
414   @SuppressWarnings("javadoc")
415   @Deprecated
416   public final int getLine() {
417      return line;
418   }
419
420   /**
421    * @deprecated Unused.
422    */
423   @SuppressWarnings("javadoc")
424   @Deprecated
425   public final int getColumn() {
426      return column;
427   }
428
429   /**
430    * @deprecated Unused.
431    */
432   @SuppressWarnings("javadoc")
433   @Deprecated
434   public ObjectMap getLocation(ParserSession session) {
435      return session.getLastLocation().append("line", getLine()).append("column", getColumn());
436   }
437
438   /**
439    * @deprecated Unused.
440    */
441   @SuppressWarnings("javadoc")
442   @Deprecated
443   public final ParserPipe getPipe() {
444      return null;
445   }
446}