Source code

001// ***************************************************************************************************************************
002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
003// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
005// * with the License.  You may obtain a copy of the License at                                                              *
006// *                                                                                                                         *
007// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
008// *                                                                                                                         *
009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
011// * specific language governing permissions and limitations under the License.                                              *
012// ***************************************************************************************************************************
013package org.apache.juneau.parser;
014
015import java.io.*;
016
017import org.apache.juneau.common.internal.*;
018import org.apache.juneau.internal.*;
019
020/**
021 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
022 *
023 * <p>
024 * Code is optimized to work with a 1 character buffer.
025 *
026 * <p>
027 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
028 * characters from the previous mark point.
029 *
030 * <h5 class='section'>Notes:</h5><ul>
031 *    <li class='warn'>This class is not thread safe.
032 * </ul>
033 *
034 * <h5 class='section'>See Also:</h5><ul>
035 *    <li class='link'><a class="doclink" href="../../../../index.html#jm.SerializersAndParsers">Serializers and Parsers</a>
036
037 * </ul>
038 */
039public class ParserReader extends Reader implements Positionable {
040
041   /** Wrapped reader */
042   protected final Reader r;
043
044   private char[] buff;       // Internal character buffer
045   private int line = 1;      // Current line number
046   private int column;        // Current column number
047   private int iCurrent = 0;  // Current pointer into character buffer
048   private int iMark = -1;    // Mark position in buffer
049   private int iEnd = 0;      // The last good character position in the buffer
050   private boolean endReached, holesExist;
051   private final boolean unbuffered;
052
053   /**
054    * Constructor.
055    *
056    * @param pipe The parser input.
057    * @throws IOException Thrown by underlying stream.
058    */
059   public ParserReader(ParserPipe pipe) throws IOException {
060      this.unbuffered = pipe.unbuffered;
061      if (pipe.isString()) {
062         String in = pipe.getInputAsString();
063         this.r = new CharSequenceReader(in);
064         this.buff = new char[in.length() < 1024 ? in.length() : 1024];
065      } else {
066         Reader _r = pipe.getReader();
067         if (_r instanceof ParserReader)
068            this.r = ((ParserReader)_r).r;
069         else
070            this.r = _r;
071         this.buff = new char[1024];
072      }
073      pipe.setPositionable(this);
074   }
075
076   /**
077    * Reads a single character.
078    *
079    * <p>
080    * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
081    * returns them as two <jk>char</jk>s.
082    * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
083    *
084    * @return The character read, or -1 if the end of the stream has been reached.
085    * @throws IOException If a problem occurred trying to read from the reader.
086    */
087   @Override /* Reader */
088   public final int read() throws IOException {
089      int c = readFromBuff();
090      if (c == -1)
091         return -1;
092      if (c == '\n') {
093         line++;
094         column = 0;
095      } else {
096         column++;
097      }
098      return c;
099   }
100
101   /**
102    * Same as {@link #read()} but skips over any whitespace characters.
103    *
104    * @return The first non-whitespace character, or -1 if the end of stream reached.
105    * @throws IOException Thrown by underlying stream.
106    */
107   public final int readSkipWs() throws IOException {
108      while (true) {
109         int c = read();
110         if (c == -1 || ! Character.isWhitespace(c))
111            return c;
112      }
113   }
114
115   /**
116    * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
117    *
118    * @return The character read, or -1 if the end of the stream has been reached.
119    * @throws IOException If a problem occurred trying to read from the reader.
120    */
121   public final int readCodePoint() throws IOException {
122      int c = read();
123
124      // Characters that take up 2 chars.
125      if (c >= 0xd800 && c <= 0xdbff) {
126         int low = read();
127         if (low >= 0xdc00 && low <= 0xdfff)
128            c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
129      }
130
131      return c;
132   }
133
134   private final int readFromBuff() throws IOException {
135      while (iCurrent >= iEnd) {
136         if (endReached)
137            return -1;
138
139         // If there's still space at the end of this buffer, fill it.
140         // Make sure there's at least 2 character spaces free for extended unicode characters.
141         //if (false) {
142         if (iEnd+1 < buff.length) {
143            int x = read(buff, iCurrent, buff.length-iEnd);
144            if (x == -1) {
145               endReached = true;
146               return -1;
147            }
148            iEnd += x;
149
150         } else {
151            // If we're currently marking, then we want to copy from the current mark point
152            // to the beginning of the buffer and then fill in the remainder of buffer.
153            if (iMark >= 0) {
154
155               // If we're marking from the beginning of the array, we double the size of the
156               // buffer.  This isn't likely to occur often.
157               if (iMark == 0) {
158                  char[] buff2 = new char[buff.length<<1];
159                  System.arraycopy(buff, 0, buff2, 0, buff.length);
160                  buff = buff2;
161
162               // Otherwise, we copy what's currently marked to the beginning of the buffer.
163               } else {
164                  int copyBuff = iMark;
165                  System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
166                  iCurrent -= copyBuff;
167                  iMark -= copyBuff;
168               }
169               int expected = buff.length - iCurrent;
170
171               int x = read(buff, iCurrent, expected);
172               if (x == -1) {
173                  endReached = true;
174                  iEnd = iCurrent;
175                  return -1;
176               }
177               iEnd = iCurrent + x;
178            } else {
179               // Copy the last 10 chars in the buffer to the beginning of the buffer.
180               int copyBuff = Math.min(iCurrent, 10);
181               System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff);
182
183               // Number of characters we expect to copy on the next read.
184               int expected = buff.length - copyBuff;
185               int x = read(buff, copyBuff, expected);
186               iCurrent = copyBuff;
187               if (x == -1) {
188                  endReached = true;
189                  iEnd = iCurrent;
190                  return -1;
191               }
192               iEnd = iCurrent + x;
193            }
194         }
195      }
196      return buff[iCurrent++];
197   }
198
199   /**
200    * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}.
201    */
202   public final void mark() {
203      iMark = iCurrent;
204   }
205
206   /**
207    * Peeks the next character in the stream.
208    *
209    * <p>
210    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
211    *
212    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
213    * @throws IOException If a problem occurred trying to read from the reader.
214    */
215   public final int peek() throws IOException {
216      int c = read();
217      if (c != -1)
218         unread();
219      return c;
220   }
221
222   /**
223    * Same as {@link #peek()} but skips over any whitespace characters.
224    *
225    * <p>
226    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
227    *
228    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
229    * @throws IOException If a problem occurred trying to read from the reader.
230    */
231   public final int peekSkipWs() throws IOException {
232      while(true) {
233         int c = read();
234         boolean isWs = Character.isWhitespace(c);
235         if (c != -1 && ! isWs)
236            unread();
237         if (! isWs)
238            return c;
239      }
240   }
241
242   /**
243    * Read the specified number of characters off the stream.
244    *
245    * @param num The number of characters to read.
246    * @return The characters packaged as a String.
247    * @throws IOException If a problem occurred trying to read from the reader.
248    */
249   public final String read(int num) throws IOException {
250      char[] c = new char[num];
251      for (int i = 0; i < num; i++) {
252         int c2 = read();
253         if (c2 == -1)
254            return new String(c, 0, i);
255         c[i] = (char)c2;
256      }
257      return new String(c);
258   }
259
260   /**
261    * Pushes the last read character back into the stream.
262    *
263    * @return This object.
264    * @throws IOException If a problem occurred trying to read from the reader.
265    */
266   public ParserReader unread() throws IOException {
267      if (iCurrent <= 0)
268         throw new IOException("Buffer underflow.");
269      iCurrent--;
270      if (column == 0)
271         line--;
272      else
273         column--;
274      return this;
275   }
276
277   /**
278    * No-op.
279    *
280    * <p>
281    * Input readers are closed in the {@link ParserPipe} class.
282    *
283    * @throws IOException If a problem occurred trying to read from the reader.
284    */
285   @Override /* Reader */
286   public void close() throws IOException {
287      // No-op
288   }
289
290   /**
291    * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
292    *
293    * @return The contents of the reusable character buffer as a string.
294    */
295   public final String getMarked() {
296      return getMarked(0, 0);
297   }
298
299   /**
300    * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
301    *
302    * <p>
303    * For example, to return the marked string, but trim the first and last characters, call the following:
304    * <p class='bjava'>
305    *    getFromMarked(1, -1);
306    * </p>
307    *
308    * @param offsetStart The offset of the start position.
309    * @param offsetEnd The offset of the end position.
310    * @return The contents of the reusable character buffer as a string.
311    */
312   public final String getMarked(int offsetStart, int offsetEnd) {
313      int offset = 0;
314
315      // Holes are \u00FF 'delete' characters that we need to get rid of now.
316      if (holesExist) {
317         for (int i = iMark; i < iCurrent; i++) {
318            char c = buff[i];
319            if (c == 127)
320               offset++;
321            else
322               buff[i-offset] = c;
323         }
324         holesExist = false;
325      }
326      int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
327      String s = new String(buff, start, len);
328      iMark = -1;
329      return s;
330   }
331
332   /**
333    * Trims off the last character in the marking buffer.
334    *
335    * <p>
336    * Useful for removing escape characters from sequences.
337    *
338    * @return This object.
339    */
340   public final ParserReader delete() {
341      return delete(1);
342   }
343
344   /**
345    * Trims off the specified number of last characters in the marking buffer.
346    * Useful for removing escape characters from sequences.
347    *
348    * @param count The number of characters to delete.
349    * @return This object.
350    */
351   public final ParserReader delete(int count) {
352      for (int i = 0; i < count; i++)
353         buff[iCurrent-i-1] = 127;
354      holesExist = true;
355      return this;
356   }
357
358   /**
359    * Replaces the last character in the marking buffer with the specified character.
360    *
361    * <p>
362    * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended
363    * unicode characters in order for the replacement to fit into the buffer.
364    *
365    * @param c The new character.
366    * @param offset The offset.
367    * @return This object.
368    * @throws IOException Thrown by underlying stream.
369    */
370   public final ParserReader replace(int c, int offset) throws IOException {
371      if (c < 0x10000) {
372         if (offset < 1)
373            throw new IOException("Buffer underflow.");
374         buff[iCurrent-offset] = (char)c;
375      } else {
376         if (offset < 2)
377            throw new IOException("Buffer underflow.");
378         c -= 0x10000;
379         buff[iCurrent-offset] = (char)(0xd800 + (c >> 10));
380         buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff));
381         offset--;
382      }
383      // Fill in the gap with DEL characters.
384      for (int i = 1; i < offset; i++)
385         buff[iCurrent-i] = 127;
386      holesExist |= (offset > 1);
387      return this;
388   }
389
390   /**
391    * Replace the last read character in the buffer with the specified character.
392    *
393    * @param c The new character.
394    * @return This object.
395    * @throws IOException Thrown by underlying stream.
396    */
397   public final ParserReader replace(char c) throws IOException {
398      return replace(c, 1);
399   }
400   /**
401    * Reads a numeric string from the specified reader.
402    *
403    * @return The parsed number string.
404    * @throws IOException Thrown by underlying stream.
405    */
406   public String parseNumberString() throws IOException {
407      mark();
408      int c = 0;
409      while (true) {
410         c = read();
411         if (c == -1)
412            break;
413         if (! StringUtils.isNumberChar((char)c)) {
414            unread();
415            break;
416         }
417      }
418      return getMarked();
419   }
420
421   /**
422    * Subclasses can override this method to provide additional filtering.
423    *
424    * <p>
425    * Default implementation simply calls the same method on the underlying reader.
426    */
427   @Override /* Reader */
428   public int read(char[] cbuf, int off, int len) throws IOException {
429      return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
430   }
431
432   @Override /* Positionable */
433   public Position getPosition() {
434      return new Position(line, column);
435   }
436}