1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.juneau.parser;
18
19 import static org.apache.juneau.commons.utils.StringUtils.*;
20 import static org.apache.juneau.commons.utils.ThrowableUtils.*;
21
22 import java.io.*;
23
24 import org.apache.juneau.commons.io.*;
25
26 /**
27 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
28 *
29 * <p>
30 * Code is optimized to work with a 1 character buffer.
31 *
32 * <p>
33 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
34 * characters from the previous mark point.
35 *
36 * <h5 class='section'>Notes:</h5><ul>
37 * <li class='warn'>This class is not thread safe.
38 * </ul>
39 *
40 * <h5 class='section'>See Also:</h5><ul>
41 * <li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/SerializersAndParsers">Serializers and Parsers</a>
42
43 * </ul>
44 */
45 @SuppressWarnings("resource")
46 public class ParserReader extends Reader implements Positionable {
47
48 /** Wrapped reader */
49 protected final Reader r;
50
51 private char[] buff; // Internal character buffer
52 private int line = 1; // Current line number
53 private int column; // Current column number
54 private int iCurrent; // Current pointer into character buffer
55 private int iMark = -1; // Mark position in buffer
56 private int iEnd; // The last good character position in the buffer
57 private boolean endReached, holesExist;
58 private final boolean unbuffered;
59
60 /**
61 * Constructor.
62 *
63 * @param pipe The parser input.
64 * @throws IOException Thrown by underlying stream.
65 */
66 public ParserReader(ParserPipe pipe) throws IOException {
67 this.unbuffered = pipe.unbuffered;
68 if (pipe.isString()) {
69 String in = pipe.getInputAsString();
70 this.r = new CharSequenceReader(in);
71 this.buff = new char[in.length() < 1024 ? in.length() : 1024];
72 } else {
73 Reader _r = pipe.getReader();
74 if (_r instanceof ParserReader _r2)
75 this.r = _r2.r;
76 else
77 this.r = _r;
78 this.buff = new char[1024];
79 }
80 pipe.setPositionable(this);
81 }
82
83 /**
84 * No-op.
85 *
86 * <p>
87 * Input readers are closed in the {@link ParserPipe} class.
88 *
89 * @throws IOException If a problem occurred trying to read from the reader.
90 */
91 @Override /* Overridden from Reader */
92 public void close() throws IOException {
93 // No-op
94 }
95
96 /**
97 * Trims off the last character in the marking buffer.
98 *
99 * <p>
100 * Useful for removing escape characters from sequences.
101 *
102 * @return This object.
103 */
104 public final ParserReader delete() {
105 return delete(1);
106 }
107
108 /**
109 * Trims off the specified number of last characters in the marking buffer.
110 * Useful for removing escape characters from sequences.
111 *
112 * @param count The number of characters to delete.
113 * @return This object.
114 */
115 public final ParserReader delete(int count) {
116 for (var i = 0; i < count; i++)
117 buff[iCurrent - i - 1] = 127;
118 holesExist = true;
119 return this;
120 }
121
122 /**
123 * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
124 *
125 * @return The contents of the reusable character buffer as a string.
126 */
127 public final String getMarked() { return getMarked(0, 0); }
128
129 /**
130 * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
131 *
132 * <p>
133 * For example, to return the marked string, but trim the first and last characters, call the following:
134 * <p class='bjava'>
135 * getFromMarked(1, -1);
136 * </p>
137 *
138 * @param offsetStart The offset of the start position.
139 * @param offsetEnd The offset of the end position.
140 * @return The contents of the reusable character buffer as a string.
141 */
142 public final String getMarked(int offsetStart, int offsetEnd) {
143 int offset = 0;
144
145 // Holes are \u00FF 'delete' characters that we need to get rid of now.
146 if (holesExist) {
147 for (var i = iMark; i < iCurrent; i++) {
148 char c = buff[i];
149 if (c == 127)
150 offset++;
151 else
152 buff[i - offset] = c;
153 }
154 holesExist = false;
155 }
156 int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
157 var s = new String(buff, start, len);
158 iMark = -1;
159 return s;
160 }
161
162 @Override /* Overridden from Positionable */
163 public Position getPosition() { return new Position(line, column); }
164
165 /**
166 * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}.
167 */
168 public final void mark() {
169 iMark = iCurrent;
170 }
171
172 /**
173 * Reads a numeric string from the specified reader.
174 *
175 * @return The parsed number string.
176 * @throws IOException Thrown by underlying stream.
177 */
178 public String parseNumberString() throws IOException {
179 mark();
180 int c = 0;
181 while (true) {
182 c = read();
183 if (c == -1)
184 break;
185 if (! isNumberChar((char)c)) {
186 unread();
187 break;
188 }
189 }
190 return getMarked();
191 }
192
193 /**
194 * Peeks the next character in the stream.
195 *
196 * <p>
197 * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
198 *
199 * @return The peeked character, or (char)-1 if the end of the stream has been reached.
200 * @throws IOException If a problem occurred trying to read from the reader.
201 */
202 public final int peek() throws IOException {
203 int c = read();
204 if (c != -1)
205 unread();
206 return c;
207 }
208
209 /**
210 * Same as {@link #peek()} but skips over any whitespace characters.
211 *
212 * <p>
213 * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
214 *
215 * @return The peeked character, or (char)-1 if the end of the stream has been reached.
216 * @throws IOException If a problem occurred trying to read from the reader.
217 */
218 public final int peekSkipWs() throws IOException {
219 while (true) {
220 var c = read();
221 var isWs = Character.isWhitespace(c);
222 if (c != -1 && ! isWs)
223 unread();
224 if (! isWs)
225 return c;
226 }
227 }
228
229 /**
230 * Reads a single character.
231 *
232 * <p>
233 * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
234 * returns them as two <jk>char</jk>s.
235 * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
236 *
237 * @return The character read, or -1 if the end of the stream has been reached.
238 * @throws IOException If a problem occurred trying to read from the reader.
239 */
240 @Override /* Overridden from Reader */
241 public final int read() throws IOException {
242 int c = readFromBuff();
243 if (c == -1)
244 return -1;
245 if (c == '\n') {
246 line++;
247 column = 0;
248 } else {
249 column++;
250 }
251 return c;
252 }
253
254 /**
255 * Subclasses can override this method to provide additional filtering.
256 *
257 * <p>
258 * Default implementation simply calls the same method on the underlying reader.
259 */
260 @Override /* Overridden from Reader */
261 public int read(char[] cbuf, int off, int len) throws IOException {
262 return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
263 }
264
265 /**
266 * Read the specified number of characters off the stream.
267 *
268 * @param num The number of characters to read.
269 * @return The characters packaged as a String.
270 * @throws IOException If a problem occurred trying to read from the reader.
271 */
272 public final String read(int num) throws IOException {
273 var c = new char[num];
274 for (var i = 0; i < num; i++) {
275 var c2 = read();
276 if (c2 == -1)
277 return new String(c, 0, i);
278 c[i] = (char)c2;
279 }
280 return new String(c);
281 }
282
283 /**
284 * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
285 *
286 * @return The character read, or -1 if the end of the stream has been reached.
287 * @throws IOException If a problem occurred trying to read from the reader.
288 */
289 public final int readCodePoint() throws IOException {
290 int c = read();
291
292 // Characters that take up 2 chars.
293 if (c >= 0xd800 && c <= 0xdbff) {
294 var low = read();
295 if (low >= 0xdc00 && low <= 0xdfff)
296 c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
297 }
298
299 return c;
300 }
301
302 /**
303 * Same as {@link #read()} but skips over any whitespace characters.
304 *
305 * @return The first non-whitespace character, or -1 if the end of stream reached.
306 * @throws IOException Thrown by underlying stream.
307 */
308 public final int readSkipWs() throws IOException {
309 while (true) {
310 var c = read();
311 if (c == -1 || ! Character.isWhitespace(c))
312 return c;
313 }
314 }
315
316 /**
317 * Replace the last read character in the buffer with the specified character.
318 *
319 * @param c The new character.
320 * @return This object.
321 * @throws IOException Thrown by underlying stream.
322 */
323 public final ParserReader replace(char c) throws IOException {
324 return replace(c, 1);
325 }
326
327 /**
328 * Replaces the last character in the marking buffer with the specified character.
329 *
330 * <p>
331 * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended
332 * unicode characters in order for the replacement to fit into the buffer.
333 *
334 * @param c The new character.
335 * @param offset The offset.
336 * @return This object.
337 * @throws IOException Thrown by underlying stream.
338 */
339 public final ParserReader replace(int c, int offset) throws IOException {
340 if (c < 0x10000) {
341 if (offset < 1)
342 throw ioex("Buffer underflow.");
343 buff[iCurrent - offset] = (char)c;
344 } else {
345 if (offset < 2)
346 throw ioex("Buffer underflow.");
347 c -= 0x10000;
348 buff[iCurrent - offset] = (char)(0xd800 + (c >> 10));
349 buff[iCurrent - offset + 1] = (char)(0xdc00 + (c & 0x3ff));
350 offset--;
351 }
352 // Fill in the gap with DEL characters.
353 for (var i = 1; i < offset; i++)
354 buff[iCurrent - i] = 127;
355 holesExist |= (offset > 1);
356 return this;
357 }
358
359 /**
360 * Pushes the last read character back into the stream.
361 *
362 * @return This object.
363 * @throws IOException If a problem occurred trying to read from the reader.
364 */
365 public ParserReader unread() throws IOException {
366 if (iCurrent <= 0)
367 throw ioex("Buffer underflow.");
368 iCurrent--;
369 if (column == 0)
370 line--;
371 else
372 column--;
373 return this;
374 }
375
376 private final int readFromBuff() throws IOException {
377 while (iCurrent >= iEnd) {
378 if (endReached)
379 return -1;
380
381 // If there's still space at the end of this buffer, fill it.
382 // Make sure there's at least 2 character spaces free for extended unicode characters.
383 if (iEnd + 1 < buff.length) {
384 int x = read(buff, iCurrent, buff.length - iEnd);
385 if (x == -1) {
386 endReached = true;
387 return -1;
388 }
389 iEnd += x;
390
391 } else {
392 // If we're currently marking, then we want to copy from the current mark point
393 // to the beginning of the buffer and then fill in the remainder of buffer.
394 if (iMark >= 0) {
395
396 // If we're marking from the beginning of the array, we double the size of the
397 // buffer. This isn't likely to occur often.
398 if (iMark == 0) {
399 var buff2 = new char[buff.length << 1];
400 System.arraycopy(buff, 0, buff2, 0, buff.length);
401 buff = buff2;
402
403 // Otherwise, we copy what's currently marked to the beginning of the buffer.
404 } else {
405 int copyBuff = iMark;
406 System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
407 iCurrent -= copyBuff;
408 iMark -= copyBuff;
409 }
410 int expected = buff.length - iCurrent;
411
412 int x = read(buff, iCurrent, expected);
413 if (x == -1) {
414 endReached = true;
415 iEnd = iCurrent;
416 return -1;
417 }
418 iEnd = iCurrent + x;
419 } else {
420 // Copy the last 10 chars in the buffer to the beginning of the buffer.
421 int copyBuff = Math.min(iCurrent, 10);
422 System.arraycopy(buff, iCurrent - copyBuff, buff, 0, copyBuff);
423
424 // Number of characters we expect to copy on the next read.
425 int expected = buff.length - copyBuff;
426 int x = read(buff, copyBuff, expected);
427 iCurrent = copyBuff;
428 if (x == -1) {
429 endReached = true;
430 iEnd = iCurrent;
431 return -1;
432 }
433 iEnd = iCurrent + x;
434 }
435 }
436 }
437 return buff[iCurrent++];
438 }
439 }