/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.juneau.parser;

import java.io.*;

import org.apache.juneau.common.utils.*;
import org.apache.juneau.internal.*;

/**
 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
 *
 * <p>
 * Code is optimized to work with a 1 character buffer.
 *
 * <p>
 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
 * characters from the previous mark point.
 *
 * <h5 class='section'>Notes:</h5><ul>
 * 	<li class='warn'>This class is not thread safe.
 * </ul>
 *
 * <h5 class='section'>See Also:</h5><ul>
 * 	<li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/SerializersAndParsers">Serializers and Parsers</a>
 * </ul>
 */
public class ParserReader extends Reader implements Positionable {

    /** Wrapped reader */
    protected final Reader r;

    private char[] buff;                // Internal character buffer
    private int line = 1;               // Current line number
    private int column;                 // Current column number
    private int columnBeforeNewline;    // Value of 'column' just before the most recent '\n' was read
    private int iCurrent;               // Current pointer into character buffer
    private int iMark = -1;             // Mark position in buffer (-1 when not marking)
    private int iEnd;                   // The last good character position in the buffer
    private boolean endReached, holesExist;
    private final boolean unbuffered;

    /**
     * Constructor.
     *
     * <p>
     * For string input the internal buffer is sized to the string length (capped at 1024 characters).
     * For reader input a 1024 character buffer is used, and if the supplied reader is itself a
     * {@link ParserReader} its wrapped reader is used directly.
     *
     * @param pipe The parser input.
     * @throws IOException Thrown by underlying stream.
     */
    public ParserReader(ParserPipe pipe) throws IOException {
        this.unbuffered = pipe.unbuffered;
        if (pipe.isString()) {
            String in = pipe.getInputAsString();
            this.r = new CharSequenceReader(in);
            this.buff = new char[Math.min(in.length(), 1024)];
        } else {
            Reader source = pipe.getReader();
            // Avoid stacking one ParserReader on top of another.
            this.r = (source instanceof ParserReader) ? ((ParserReader)source).r : source;
            this.buff = new char[1024];
        }
        pipe.setPositionable(this);
    }

    /**
     * Reads a single character.
     *
     * <p>
     * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
     * returns them as two <jk>char</jk>s.
     * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
     *
     * @return The character read, or -1 if the end of the stream has been reached.
     * @throws IOException If a problem occurred trying to read from the reader.
     */
    @Override /* Reader */
    public final int read() throws IOException {
        int c = readFromBuff();
        if (c == -1) {
            return -1;
        }
        if (c == '\n') {
            line++;
            // Remember where we were so that unread() of this newline can restore the column.
            columnBeforeNewline = column;
            column = 0;
        } else {
            column++;
        }
        return c;
    }

    /**
     * Same as {@link #read()} but skips over any whitespace characters.
     *
     * @return The first non-whitespace character, or -1 if the end of stream reached.
     * @throws IOException Thrown by underlying stream.
     */
    public final int readSkipWs() throws IOException {
        while (true) {
            int c = read();
            if (c == -1 || ! Character.isWhitespace(c)) {
                return c;
            }
        }
    }

    /**
     * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
     *
     * <p>
     * If a high surrogate is not followed by a low surrogate, the high surrogate is returned as-is and the following
     * character is left in the stream so that the next read returns it.
     *
     * @return The character read, or -1 if the end of the stream has been reached.
     * @throws IOException If a problem occurred trying to read from the reader.
     */
    public final int readCodePoint() throws IOException {
        int c = read();

        // Characters that take up 2 chars.
        if (c != -1 && Character.isHighSurrogate((char)c)) {
            int low = read();
            if (low != -1) {
                if (Character.isLowSurrogate((char)low)) {
                    c = Character.toCodePoint((char)c, (char)low);
                } else {
                    // Unpaired high surrogate: don't swallow the character that follows it.
                    unread();
                }
            }
        }

        return c;
    }

    /**
     * Returns the next character from the internal buffer, refilling the buffer from the underlying reader if needed.
     *
     * <p>
     * Does not touch the line/column counters.
     *
     * @return The next character, or -1 if the end of the stream has been reached.
     * @throws IOException Thrown by underlying stream.
     */
    private int readFromBuff() throws IOException {
        while (iCurrent >= iEnd) {
            if (endReached)
                return -1;

            // If there's still space at the end of this buffer, fill it.
            // Make sure there's at least 2 character spaces free for extended unicode characters.
            if (iEnd+1 < buff.length) {
                int x = read(buff, iCurrent, buff.length-iEnd);
                if (x == -1) {
                    endReached = true;
                    return -1;
                }
                iEnd += x;

            } else {
                // If we're currently marking, then we want to copy from the current mark point
                // to the beginning of the buffer and then fill in the remainder of buffer.
                if (iMark >= 0) {

                    // If we're marking from the beginning of the array, we double the size of the
                    // buffer.  This isn't likely to occur often.
                    if (iMark == 0) {
                        char[] buff2 = new char[buff.length<<1];
                        System.arraycopy(buff, 0, buff2, 0, buff.length);
                        buff = buff2;

                    // Otherwise, we copy what's currently marked to the beginning of the buffer.
                    } else {
                        int copyBuff = iMark;
                        System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
                        iCurrent -= copyBuff;
                        iMark -= copyBuff;
                    }
                    int expected = buff.length - iCurrent;

                    int x = read(buff, iCurrent, expected);
                    if (x == -1) {
                        endReached = true;
                        iEnd = iCurrent;
                        return -1;
                    }
                    iEnd = iCurrent + x;
                } else {
                    // Copy the last 10 chars in the buffer to the beginning of the buffer
                    // so that recently-read characters remain available for unread().
                    int copyBuff = Math.min(iCurrent, 10);
                    System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff);

                    // Number of characters we expect to copy on the next read.
                    int expected = buff.length - copyBuff;
                    int x = read(buff, copyBuff, expected);
                    iCurrent = copyBuff;
                    if (x == -1) {
                        endReached = true;
                        iEnd = iCurrent;
                        return -1;
                    }
                    iEnd = iCurrent + x;
                }
            }
        }
        return buff[iCurrent++];
    }

    /**
     * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@link #getMarked()}.
     */
    public final void mark() {
        iMark = iCurrent;
    }

    /**
     * Peeks the next character in the stream.
     *
     * <p>
     * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
     *
     * @return The peeked character, or -1 if the end of the stream has been reached.
     * @throws IOException If a problem occurred trying to read from the reader.
     */
    public final int peek() throws IOException {
        int c = read();
        if (c != -1) {
            unread();
        }
        return c;
    }

    /**
     * Same as {@link #peek()} but skips over any whitespace characters.
     *
     * <p>
     * Leading whitespace is consumed; the first non-whitespace character is read and then pushed back with
     * {@code unread()} so that the next read returns it.
     *
     * @return The peeked character, or -1 if the end of the stream has been reached.
     * @throws IOException If a problem occurred trying to read from the reader.
     */
    public final int peekSkipWs() throws IOException {
        while (true) {
            int c = read();
            boolean isWs = Character.isWhitespace(c);
            if (c != -1 && ! isWs) {
                unread();
            }
            if (! isWs) {
                return c;
            }
        }
    }

    /**
     * Read the specified number of characters off the stream.
     *
     * <p>
     * If the end of the stream is reached first, the characters read so far are returned.
     *
     * @param num The number of characters to read.
     * @return The characters packaged as a String.
     * @throws IOException If a problem occurred trying to read from the reader.
     */
    public final String read(int num) throws IOException {
        char[] chars = new char[num];
        for (int i = 0; i < num; i++) {
            int c = read();
            if (c == -1) {
                return new String(chars, 0, i);
            }
            chars[i] = (char)c;
        }
        return new String(chars);
    }

    /**
     * Pushes the last read character back into the stream.
     *
     * <p>
     * If the character being pushed back is the newline that started the current line, the line number is
     * decremented and the column is restored to the value it had before that newline was read.
     *
     * @return This object.
     * @throws IOException If there is no previously read character in the buffer to push back.
     */
    public ParserReader unread() throws IOException {
        if (iCurrent <= 0) {
            throw new IOException("Buffer underflow.");
        }
        iCurrent--;
        if (column == 0) {
            line--;
            column = columnBeforeNewline;
        } else {
            column--;
        }
        return this;
    }

    /**
     * No-op.
     *
     * <p>
     * Input readers are closed in the {@link ParserPipe} class.
     *
     * @throws IOException If a problem occurred trying to read from the reader.
     */
    @Override /* Reader */
    public void close() throws IOException {
        // No-op
    }

    /**
     * Returns the characters read since the last call to {@link #mark()} as a string, and clears the mark.
     *
     * @return The characters read since the mark point.
     */
    public final String getMarked() {
        return getMarked(0, 0);
    }

    /**
     * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
     *
     * <p>
     * For example, to return the marked string, but trim the first and last characters, call the following:
     * <p class='bjava'>
     * 	getMarked(1, -1);
     * </p>
     *
     * @param offsetStart The offset of the start position.
     * @param offsetEnd The offset of the end position.
     * @return The characters read since the mark point, adjusted by the offsets.
     */
    public final String getMarked(int offsetStart, int offsetEnd) {
        int offset = 0;

        // Holes are \u007F 'delete' characters that we need to get rid of now.
        // Compact the marked region in place, counting how many holes were removed.
        if (holesExist) {
            for (int i = iMark; i < iCurrent; i++) {
                char c = buff[i];
                if (c == 127) {
                    offset++;
                } else {
                    buff[i-offset] = c;
                }
            }
            holesExist = false;
        }
        int start = iMark + offsetStart;
        int len = iCurrent - iMark + offsetEnd - offsetStart - offset;
        String s = new String(buff, start, len);
        iMark = -1;
        return s;
    }

    /**
     * Trims off the last character in the marking buffer.
     *
     * <p>
     * Useful for removing escape characters from sequences.
     *
     * @return This object.
     */
    public final ParserReader delete() {
        return delete(1);
    }

    /**
     * Trims off the specified number of last characters in the marking buffer.
     * Useful for removing escape characters from sequences.
     *
     * <p>
     * The characters are overwritten with DEL (127) markers which are stripped out by {@link #getMarked(int, int)}.
     *
     * @param count The number of characters to delete.
     * @return This object.
     */
    public final ParserReader delete(int count) {
        for (int i = 0; i < count; i++) {
            buff[iCurrent-i-1] = 127;
        }
        holesExist = true;
        return this;
    }

    /**
     * Replaces the last <c>offset</c> characters in the marking buffer with the specified character.
     *
     * <p>
     * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended
     * unicode characters in order for the replacement to fit into the buffer.
     * It must also not reach back past the beginning of the buffer.
     * Any leftover positions are filled with DEL (127) markers which are stripped out by
     * {@link #getMarked(int, int)}.
     *
     * @param c The new character.
     * @param offset The offset.
     * @return This object.
     * @throws IOException If <c>offset</c> is too small to hold the character or reaches before the start of the buffer.
     */
    public final ParserReader replace(int c, int offset) throws IOException {
        int minOffset = (c < 0x10000) ? 1 : 2;
        // Validate before writing so that a bad offset never leaves the buffer partially modified.
        if (offset < minOffset || offset > iCurrent) {
            throw new IOException("Buffer underflow.");
        }
        if (c < 0x10000) {
            buff[iCurrent-offset] = (char)c;
        } else {
            c -= 0x10000;
            buff[iCurrent-offset] = (char)(0xd800 + (c >> 10));
            buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff));
            // The surrogate pair occupies two positions, so one fewer position remains to be filled.
            offset--;
        }
        // Fill in the gap with DEL characters.
        for (int i = 1; i < offset; i++) {
            buff[iCurrent-i] = 127;
        }
        holesExist |= (offset > 1);
        return this;
    }

    /**
     * Replace the last read character in the buffer with the specified character.
     *
     * @param c The new character.
     * @return This object.
     * @throws IOException Thrown by underlying stream.
     */
    public final ParserReader replace(char c) throws IOException {
        return replace(c, 1);
    }

    /**
     * Reads a numeric string from the specified reader.
     *
     * <p>
     * Reads characters for as long as {@code StringUtils.isNumberChar(char)} accepts them.
     * The first rejected character is pushed back onto the stream.
     *
     * @return The parsed number string.
     * @throws IOException Thrown by underlying stream.
     */
    public String parseNumberString() throws IOException {
        mark();
        while (true) {
            int c = read();
            if (c == -1) {
                break;
            }
            if (! StringUtils.isNumberChar((char)c)) {
                unread();
                break;
            }
        }
        return getMarked();
    }

    /**
     * Subclasses can override this method to provide additional filtering.
     *
     * <p>
     * Default implementation simply calls the same method on the underlying reader.
     * In unbuffered mode, a single character is requested per call regardless of <c>len</c>.
     */
    @Override /* Reader */
    public int read(char[] cbuf, int off, int len) throws IOException {
        return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
    }

    /**
     * Returns the current line and column of this reader.
     */
    @Override /* Positionable */
    public Position getPosition() {
        return new Position(line, column);
    }
}