Source code

001// ***************************************************************************************************************************
002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
003// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
005// * with the License.  You may obtain a copy of the License at                                                              *
006// *                                                                                                                         *
007// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
008// *                                                                                                                         *
009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
011// * specific language governing permissions and limitations under the License.                                              *
012// ***************************************************************************************************************************
013package org.apache.juneau.parser;
014
015import java.io.*;
016
017import org.apache.juneau.*;
018import org.apache.juneau.internal.*;
019
020/**
021 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
022 * 
023 * <p>
024 * Code is optimized to work with a 1 character buffer.
025 * 
026 * <p>
027 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
028 * characters from the previous mark point.
029 * 
030 * <p>
031 * <b>Warning:</b>  Not thread safe.
032 */
033public class ParserReader extends Reader {
034
035   /** Wrapped reader */
036   protected final Reader r;
037   private final ParserPipe pipe;
038
039   private char[] buff;       // Internal character buffer
040   private int line = 1;      // Current line number
041   private int column;        // Current column number
042   private int iCurrent = 0;  // Current pointer into character buffer
043   private int iMark = -1;    // Mark position in buffer
044   private int iEnd = 0;      // The last good character position in the buffer
045   private boolean endReached, holesExist;
046   private final boolean unbuffered;
047
048   /**
049    * Constructor.
050    * 
051    * @param pipe The parser input.
052    * @throws IOException
053    */
054   public ParserReader(ParserPipe pipe) throws IOException {
055      this.pipe = pipe;
056      this.unbuffered = pipe.unbuffered;
057      if (pipe.isString()) {
058         String in = pipe.getInputAsString();
059         this.r = new CharSequenceReader(in);
060         this.buff = new char[in.length() < 1024 ? in.length() : 1024];
061      } else {
062         Reader _r = pipe.getReader();
063         if (_r instanceof ParserReader)
064            this.r = ((ParserReader)_r).r;
065         else
066            this.r = _r;
067         this.buff = new char[1024];
068      }
069   }
070
071
072   /**
073    * Returns the current line number position in this reader.
074    * 
075    * @return The current line number.
076    */
077   public final int getLine() {
078      return line;
079   }
080
081   /**
082    * Returns the current column number position in this reader.
083    * 
084    * @return The current column number.
085    */
086   public final int getColumn() {
087      return column;
088   }
089
090   /**
091    * Reads a single character.
092    * 
093    * <p>
094    * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
095    * returns them as two <jk>char</jk>s.
096    * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
097    * 
098    * @return The character read, or -1 if the end of the stream has been reached.
099    * @throws IOException If a problem occurred trying to read from the reader.
100    */
101   @Override /* Reader */
102   public final int read() throws IOException {
103      int c = readFromBuff();
104      if (c == -1)
105         return -1;
106      if (c == '\n') {
107         line++;
108         column = 0;
109      } else {
110         column++;
111      }
112      return c;
113   }
114
115   /**
116    * Same as {@link #read()} but skips over any whitespace characters.
117    * 
118    * @return The first non-whitespace character, or -1 if the end of stream reached.
119    * @throws IOException
120    */
121   public final int readSkipWs() throws IOException {
122      while (true) {
123         int c = read();
124         if (c == -1 || ! Character.isWhitespace(c))
125            return c;
126      }
127   }
128
129   /**
130    * Same as {@link #read()} but detects and combines extended unicode characters (i.e. characters above 0x10000).
131    * 
132    * @return The character read, or -1 if the end of the stream has been reached.
133    * @throws IOException If a problem occurred trying to read from the reader.
134    */
135   public final int readCodePoint() throws IOException {
136      int c = read();
137
138      // Characters that take up 2 chars.
139      if (c >= 0xd800 && c <= 0xdbff) {
140         int low = read();
141         if (low >= 0xdc00 && low <= 0xdfff)
142            c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
143      }
144
145      return c;
146   }
147
148   private final int readFromBuff() throws IOException {
149      while (iCurrent >= iEnd) {
150         if (endReached)
151            return -1;
152
153         // If there's still space at the end of this buffer, fill it.
154         // Make sure there's at least 2 character spaces free for extended unicode characters.
155         //if (false) {
156         if (iEnd+1 < buff.length) {
157            int x = read(buff, iCurrent, buff.length-iEnd);
158            if (x == -1) {
159               endReached = true;
160               return -1;
161            }
162            iEnd += x;
163
164         } else {
165            // If we're currently marking, then we want to copy from the current mark point
166            // to the beginning of the buffer and then fill in the remainder of buffer.
167            if (iMark >= 0) {
168
169               // If we're marking from the beginning of the array, we double the size of the
170               // buffer.  This isn't likely to occur often.
171               if (iMark == 0) {
172                  char[] buff2 = new char[buff.length<<1];
173                  System.arraycopy(buff, 0, buff2, 0, buff.length);
174                  buff = buff2;
175
176               // Otherwise, we copy what's currently marked to the beginning of the buffer.
177               } else {
178                  int copyBuff = iMark;
179                  System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
180                  iCurrent -= copyBuff;
181                  iMark -= copyBuff;
182               }
183               int expected = buff.length - iCurrent;
184
185               int x = read(buff, iCurrent, expected);
186               if (x == -1) {
187                  endReached = true;
188                  iEnd = iCurrent;
189                  return -1;
190               }
191               iEnd = iCurrent + x;
192            } else {
193               // Copy the last 10 chars in the buffer to the beginning of the buffer.
194               int copyBuff = Math.min(iCurrent, 10);
195               System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff);
196
197               // Number of characters we expect to copy on the next read.
198               int expected = buff.length - copyBuff;
199               int x = read(buff, copyBuff, expected);
200               iCurrent = copyBuff;
201               if (x == -1) {
202                  endReached = true;
203                  iEnd = iCurrent;
204                  return -1;
205               }
206               iEnd = iCurrent + x;
207            }
208         }
209      }
210      return buff[iCurrent++];
211   }
212
213   /**
214    * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}.
215    */
216   public final void mark() {
217      iMark = iCurrent;
218   }
219
220   /**
221    * Peeks the next character in the stream.
222    * 
223    * <p>
224    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
225    * 
226    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
227    * @throws IOException If a problem occurred trying to read from the reader.
228    */
229   public final int peek() throws IOException {
230      int c = read();
231      if (c != -1)
232         unread();
233      return c;
234   }
235
236   /**
237    * Same as {@link #peek()} but skips over any whitespace characters.
238    * 
239    * <p>
240    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
241    * 
242    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
243    * @throws IOException If a problem occurred trying to read from the reader.
244    */
245   public final int peekSkipWs() throws IOException {
246      while(true) {
247         int c = read();
248         boolean isWs = Character.isWhitespace(c);
249         if (c != -1 && ! isWs)
250            unread();
251         if (! isWs)
252            return c;
253      }
254   }
255
256   /**
257    * Read the specified number of characters off the stream.
258    * 
259    * @param num The number of characters to read.
260    * @return The characters packaged as a String.
261    * @throws IOException If a problem occurred trying to read from the reader.
262    */
263   public final String read(int num) throws IOException {
264      char[] c = new char[num];
265      for (int i = 0; i < num; i++) {
266         int c2 = read();
267         if (c2 == -1)
268            return new String(c, 0, i);
269         c[i] = (char)c2;
270      }
271      return new String(c);
272   }
273
274   /**
275    * Pushes the last read character back into the stream.
276    * 
277    * @return This object (for method chaining).
278    * @throws IOException If a problem occurred trying to read from the reader.
279    */
280   public ParserReader unread() throws IOException {
281      if (iCurrent <= 0)
282         throw new IOException("Buffer underflow.");
283      iCurrent--;
284      column--;
285      return this;
286   }
287
288   /**
289    * No-op.
290    * 
291    * <p>
292    * Input readers are closed in the {@link ParserPipe} class.
293    * 
294    * @throws IOException If a problem occurred trying to read from the reader.
295    */
296   @Override /* Reader */
297   public void close() throws IOException {
298      // No-op
299   }
300
301   /**
302    * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
303    * 
304    * @return The contents of the reusable character buffer as a string.
305    */
306   public final String getMarked() {
307      return getMarked(0, 0);
308   }
309
310   /**
311    * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
312    * 
313    * <p>
314    * For example, to return the marked string, but trim the first and last characters, call the following:
315    * <p class='bcode'>
316    *    getFromMarked(1, -1);
317    * </p>
318    * 
319    * @param offsetStart The offset of the start position.
320    * @param offsetEnd The offset of the end position.
321    * @return The contents of the reusable character buffer as a string.
322    */
323   public final String getMarked(int offsetStart, int offsetEnd) {
324      int offset = 0;
325
326      // Holes are \u00FF 'delete' characters that we need to get rid of now.
327      if (holesExist) {
328         for (int i = iMark; i < iCurrent; i++) {
329            char c = buff[i];
330            if (c == 127)
331               offset++;
332            else
333               buff[i-offset] = c;
334         }
335         holesExist = false;
336      }
337      int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
338      String s = new String(buff, start, len);
339      iMark = -1;
340      return s;
341   }
342
343   /**
344    * Trims off the last character in the marking buffer.
345    * 
346    * <p>
347    * Useful for removing escape characters from sequences.
348    * 
349    * @return This object (for method chaining).
350    */
351   public final ParserReader delete() {
352      return delete(1);
353   }
354
355   /**
356    * Trims off the specified number of last characters in the marking buffer.
357    * Useful for removing escape characters from sequences.
358    * 
359    * @param count The number of characters to delete.
360    * @return This object (for method chaining).
361    */
362   public final ParserReader delete(int count) {
363      for (int i = 0; i < count; i++)
364         buff[iCurrent-i-1] = 127;
365      holesExist = true;
366      return this;
367   }
368
369   /**
370    * Replaces the last character in the marking buffer with the specified character.
371    * 
372    * <p>
373    * <code>offset</code> must be at least <code>1</code> for normal characters, and <code>2</code> for extended
374    * unicode characters in order for the replacement to fit into the buffer.
375    * 
376    * @param c The new character.
377    * @param offset The offset.
378    * @return This object (for method chaining).
379    * @throws IOException
380    */
381   public final ParserReader replace(int c, int offset) throws IOException {
382      if (c < 0x10000) {
383         if (offset < 1)
384            throw new IOException("Buffer underflow.");
385         buff[iCurrent-offset] = (char)c;
386      } else {
387         if (offset < 2)
388            throw new IOException("Buffer underflow.");
389         c -= 0x10000;
390         buff[iCurrent-offset] = (char)(0xd800 + (c >> 10));
391         buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff));
392         offset--;
393      }
394      // Fill in the gap with DEL characters.
395      for (int i = 1; i < offset; i++)
396         buff[iCurrent-i] = 127;
397      holesExist |= (offset > 1);
398      return this;
399   }
400
401   /**
402    * Replace the last read character in the buffer with the specified character.
403    * 
404    * @param c The new character.
405    * @return This object (for method chaining).
406    * @throws IOException
407    */
408   public final ParserReader replace(char c) throws IOException {
409      return replace(c, 1);
410   }
411
412   /**
413    * Subclasses can override this method to provide additional filtering.
414    * 
415    * <p>
416    * Default implementation simply calls the same method on the underlying reader.
417    */
418   @Override /* Reader */
419   public int read(char[] cbuf, int off, int len) throws IOException {
420      return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
421   }
422
423   /**
424    * Returns the combined location information on both this reader and the session.
425    * 
426    * @param session The session object to read the last location on.
427    * @return A new map describing the current parse location.
428    */
429   public ObjectMap getLocation(ParserSession session) {
430      return session.getLastLocation().append("line", getLine()).append("column", getColumn());
431   }
432
433   /**
434    * Returns the pipe that was passed into the constructor.
435    * 
436    * @return The pipe that was passed into the constructor.
437    */
438   public final ParserPipe getPipe() {
439      return pipe;
440   }
441}