View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.juneau.parser;
18  
19  import static org.apache.juneau.commons.utils.StringUtils.*;
20  import static org.apache.juneau.commons.utils.ThrowableUtils.*;
21  
22  import java.io.*;
23  
24  import org.apache.juneau.commons.io.*;
25  
26  /**
27   * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
28   *
29   * <p>
30   * Code is optimized to work with a 1 character buffer.
31   *
32   * <p>
33   * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
34   * characters from the previous mark point.
35   *
36   * <h5 class='section'>Notes:</h5><ul>
37   * 	<li class='warn'>This class is not thread safe.
38   * </ul>
39   *
40   * <h5 class='section'>See Also:</h5><ul>
41   * 	<li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/SerializersAndParsers">Serializers and Parsers</a>
42  
43   * </ul>
44   */
45  @SuppressWarnings("resource")
46  public class ParserReader extends Reader implements Positionable {
47  
48  	/** Wrapped reader */
49  	protected final Reader r;
50  
51  	private char[] buff;       // Internal character buffer
52  	private int line = 1;      // Current line number
53  	private int column;        // Current column number
54  	private int iCurrent;      // Current pointer into character buffer
55  	private int iMark = -1;    // Mark position in buffer
56  	private int iEnd;          // The last good character position in the buffer
57  	private boolean endReached, holesExist;
58  	private final boolean unbuffered;
59  
60  	/**
61  	 * Constructor.
62  	 *
63  	 * @param pipe The parser input.
64  	 * @throws IOException Thrown by underlying stream.
65  	 */
66  	public ParserReader(ParserPipe pipe) throws IOException {
67  		this.unbuffered = pipe.unbuffered;
68  		if (pipe.isString()) {
69  			String in = pipe.getInputAsString();
70  			this.r = new CharSequenceReader(in);
71  			this.buff = new char[in.length() < 1024 ? in.length() : 1024];
72  		} else {
73  			Reader _r = pipe.getReader();
74  			if (_r instanceof ParserReader _r2)
75  				this.r = _r2.r;
76  			else
77  				this.r = _r;
78  			this.buff = new char[1024];
79  		}
80  		pipe.setPositionable(this);
81  	}
82  
83  	/**
84  	 * No-op.
85  	 *
86  	 * <p>
87  	 * Input readers are closed in the {@link ParserPipe} class.
88  	 *
89  	 * @throws IOException If a problem occurred trying to read from the reader.
90  	 */
91  	@Override /* Overridden from Reader */
92  	public void close() throws IOException {
93  		// No-op
94  	}
95  
96  	/**
97  	 * Trims off the last character in the marking buffer.
98  	 *
99  	 * <p>
100 	 * Useful for removing escape characters from sequences.
101 	 *
102 	 * @return This object.
103 	 */
104 	public final ParserReader delete() {
105 		return delete(1);
106 	}
107 
108 	/**
109 	 * Trims off the specified number of last characters in the marking buffer.
110 	 * Useful for removing escape characters from sequences.
111 	 *
112 	 * @param count The number of characters to delete.
113 	 * @return This object.
114 	 */
115 	public final ParserReader delete(int count) {
116 		for (var i = 0; i < count; i++)
117 			buff[iCurrent - i - 1] = 127;
118 		holesExist = true;
119 		return this;
120 	}
121 
122 	/**
123 	 * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
124 	 *
125 	 * @return The contents of the reusable character buffer as a string.
126 	 */
127 	public final String getMarked() { return getMarked(0, 0); }
128 
129 	/**
130 	 * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
131 	 *
132 	 * <p>
133 	 * For example, to return the marked string, but trim the first and last characters, call the following:
134 	 * <p class='bjava'>
135 	 * 	getFromMarked(1, -1);
136 	 * </p>
137 	 *
138 	 * @param offsetStart The offset of the start position.
139 	 * @param offsetEnd The offset of the end position.
140 	 * @return The contents of the reusable character buffer as a string.
141 	 */
142 	public final String getMarked(int offsetStart, int offsetEnd) {
143 		int offset = 0;
144 
145 		// Holes are \u00FF 'delete' characters that we need to get rid of now.
146 		if (holesExist) {
147 			for (var i = iMark; i < iCurrent; i++) {
148 				char c = buff[i];
149 				if (c == 127)
150 					offset++;
151 				else
152 					buff[i - offset] = c;
153 			}
154 			holesExist = false;
155 		}
156 		int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
157 		var s = new String(buff, start, len);
158 		iMark = -1;
159 		return s;
160 	}
161 
162 	@Override /* Overridden from Positionable */
163 	public Position getPosition() { return new Position(line, column); }
164 
165 	/**
166 	 * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}.
167 	 */
168 	public final void mark() {
169 		iMark = iCurrent;
170 	}
171 
172 	/**
173 	 * Reads a numeric string from the specified reader.
174 	 *
175 	 * @return The parsed number string.
176 	 * @throws IOException Thrown by underlying stream.
177 	 */
178 	public String parseNumberString() throws IOException {
179 		mark();
180 		int c = 0;
181 		while (true) {
182 			c = read();
183 			if (c == -1)
184 				break;
185 			if (! isNumberChar((char)c)) {
186 				unread();
187 				break;
188 			}
189 		}
190 		return getMarked();
191 	}
192 
193 	/**
194 	 * Peeks the next character in the stream.
195 	 *
196 	 * <p>
197 	 * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
198 	 *
199 	 * @return The peeked character, or (char)-1 if the end of the stream has been reached.
200 	 * @throws IOException If a problem occurred trying to read from the reader.
201 	 */
202 	public final int peek() throws IOException {
203 		int c = read();
204 		if (c != -1)
205 			unread();
206 		return c;
207 	}
208 
209 	/**
210 	 * Same as {@link #peek()} but skips over any whitespace characters.
211 	 *
212 	 * <p>
213 	 * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
214 	 *
215 	 * @return The peeked character, or (char)-1 if the end of the stream has been reached.
216 	 * @throws IOException If a problem occurred trying to read from the reader.
217 	 */
218 	public final int peekSkipWs() throws IOException {
219 		while (true) {
220 			var c = read();
221 			var isWs = Character.isWhitespace(c);
222 			if (c != -1 && ! isWs)
223 				unread();
224 			if (! isWs)
225 				return c;
226 		}
227 	}
228 
229 	/**
230 	 * Reads a single character.
231 	 *
232 	 * <p>
233 	 * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
234 	 * returns them as two <jk>char</jk>s.
235 	 * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
236 	 *
237 	 * @return The character read, or -1 if the end of the stream has been reached.
238 	 * @throws IOException If a problem occurred trying to read from the reader.
239 	 */
240 	@Override /* Overridden from Reader */
241 	public final int read() throws IOException {
242 		int c = readFromBuff();
243 		if (c == -1)
244 			return -1;
245 		if (c == '\n') {
246 			line++;
247 			column = 0;
248 		} else {
249 			column++;
250 		}
251 		return c;
252 	}
253 
254 	/**
255 	 * Subclasses can override this method to provide additional filtering.
256 	 *
257 	 * <p>
258 	 * Default implementation simply calls the same method on the underlying reader.
259 	 */
260 	@Override /* Overridden from Reader */
261 	public int read(char[] cbuf, int off, int len) throws IOException {
262 		return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
263 	}
264 
265 	/**
266 	 * Read the specified number of characters off the stream.
267 	 *
268 	 * @param num The number of characters to read.
269 	 * @return The characters packaged as a String.
270 	 * @throws IOException If a problem occurred trying to read from the reader.
271 	 */
272 	public final String read(int num) throws IOException {
273 		var c = new char[num];
274 		for (var i = 0; i < num; i++) {
275 			var c2 = read();
276 			if (c2 == -1)
277 				return new String(c, 0, i);
278 			c[i] = (char)c2;
279 		}
280 		return new String(c);
281 	}
282 
283 	/**
284 	 * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
285 	 *
286 	 * @return The character read, or -1 if the end of the stream has been reached.
287 	 * @throws IOException If a problem occurred trying to read from the reader.
288 	 */
289 	public final int readCodePoint() throws IOException {
290 		int c = read();
291 
292 		// Characters that take up 2 chars.
293 		if (c >= 0xd800 && c <= 0xdbff) {
294 			var low = read();
295 			if (low >= 0xdc00 && low <= 0xdfff)
296 				c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
297 		}
298 
299 		return c;
300 	}
301 
302 	/**
303 	 * Same as {@link #read()} but skips over any whitespace characters.
304 	 *
305 	 * @return The first non-whitespace character, or -1 if the end of stream reached.
306 	 * @throws IOException Thrown by underlying stream.
307 	 */
308 	public final int readSkipWs() throws IOException {
309 		while (true) {
310 			var c = read();
311 			if (c == -1 || ! Character.isWhitespace(c))
312 				return c;
313 		}
314 	}
315 
316 	/**
317 	 * Replace the last read character in the buffer with the specified character.
318 	 *
319 	 * @param c The new character.
320 	 * @return This object.
321 	 * @throws IOException Thrown by underlying stream.
322 	 */
323 	public final ParserReader replace(char c) throws IOException {
324 		return replace(c, 1);
325 	}
326 
327 	/**
328 	 * Replaces the last character in the marking buffer with the specified character.
329 	 *
330 	 * <p>
331 	 * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended
332 	 * unicode characters in order for the replacement to fit into the buffer.
333 	 *
334 	 * @param c The new character.
335 	 * @param offset The offset.
336 	 * @return This object.
337 	 * @throws IOException Thrown by underlying stream.
338 	 */
339 	public final ParserReader replace(int c, int offset) throws IOException {
340 		if (c < 0x10000) {
341 			if (offset < 1)
342 				throw ioex("Buffer underflow.");
343 			buff[iCurrent - offset] = (char)c;
344 		} else {
345 			if (offset < 2)
346 				throw ioex("Buffer underflow.");
347 			c -= 0x10000;
348 			buff[iCurrent - offset] = (char)(0xd800 + (c >> 10));
349 			buff[iCurrent - offset + 1] = (char)(0xdc00 + (c & 0x3ff));
350 			offset--;
351 		}
352 		// Fill in the gap with DEL characters.
353 		for (var i = 1; i < offset; i++)
354 			buff[iCurrent - i] = 127;
355 		holesExist |= (offset > 1);
356 		return this;
357 	}
358 
359 	/**
360 	 * Pushes the last read character back into the stream.
361 	 *
362 	 * @return This object.
363 	 * @throws IOException If a problem occurred trying to read from the reader.
364 	 */
365 	public ParserReader unread() throws IOException {
366 		if (iCurrent <= 0)
367 			throw ioex("Buffer underflow.");
368 		iCurrent--;
369 		if (column == 0)
370 			line--;
371 		else
372 			column--;
373 		return this;
374 	}
375 
376 	private final int readFromBuff() throws IOException {
377 		while (iCurrent >= iEnd) {
378 			if (endReached)
379 				return -1;
380 
381 			// If there's still space at the end of this buffer, fill it.
382 			// Make sure there's at least 2 character spaces free for extended unicode characters.
383 			if (iEnd + 1 < buff.length) {
384 				int x = read(buff, iCurrent, buff.length - iEnd);
385 				if (x == -1) {
386 					endReached = true;
387 					return -1;
388 				}
389 				iEnd += x;
390 
391 			} else {
392 				// If we're currently marking, then we want to copy from the current mark point
393 				// to the beginning of the buffer and then fill in the remainder of buffer.
394 				if (iMark >= 0) {
395 
396 					// If we're marking from the beginning of the array, we double the size of the
397 					// buffer.  This isn't likely to occur often.
398 					if (iMark == 0) {
399 						var buff2 = new char[buff.length << 1];
400 						System.arraycopy(buff, 0, buff2, 0, buff.length);
401 						buff = buff2;
402 
403 						// Otherwise, we copy what's currently marked to the beginning of the buffer.
404 					} else {
405 						int copyBuff = iMark;
406 						System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
407 						iCurrent -= copyBuff;
408 						iMark -= copyBuff;
409 					}
410 					int expected = buff.length - iCurrent;
411 
412 					int x = read(buff, iCurrent, expected);
413 					if (x == -1) {
414 						endReached = true;
415 						iEnd = iCurrent;
416 						return -1;
417 					}
418 					iEnd = iCurrent + x;
419 				} else {
420 					// Copy the last 10 chars in the buffer to the beginning of the buffer.
421 					int copyBuff = Math.min(iCurrent, 10);
422 					System.arraycopy(buff, iCurrent - copyBuff, buff, 0, copyBuff);
423 
424 					// Number of characters we expect to copy on the next read.
425 					int expected = buff.length - copyBuff;
426 					int x = read(buff, copyBuff, expected);
427 					iCurrent = copyBuff;
428 					if (x == -1) {
429 						endReached = true;
430 						iEnd = iCurrent;
431 						return -1;
432 					}
433 					iEnd = iCurrent + x;
434 				}
435 			}
436 		}
437 		return buff[iCurrent++];
438 	}
439 }