001// *************************************************************************************************************************** 002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file * 003// * distributed with this work for additional information regarding copyright ownership. The ASF licenses this file * 004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance * 005// * with the License. You may obtain a copy of the License at * 006// * * 007// * http://www.apache.org/licenses/LICENSE-2.0 * 008// * * 009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an * 010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * 011// * specific language governing permissions and limitations under the License. * 012// *************************************************************************************************************************** 013package org.apache.juneau.parser; 014 015import java.io.*; 016 017import org.apache.juneau.common.internal.*; 018import org.apache.juneau.internal.*; 019 020/** 021 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character. 022 * 023 * <p> 024 * Code is optimized to work with a 1 character buffer. 025 * 026 * <p> 027 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture 028 * characters from the previous mark point. 029 * 030 * <h5 class='section'>Notes:</h5><ul> 031 * <li class='warn'>This class is not thread safe. 032 * </ul> 033 * 034 * <h5 class='section'>See Also:</h5><ul> 035 * <li class='link'><a class="doclink" href="../../../../index.html#jm.SerializersAndParsers">Serializers and Parsers</a> 036 037 * </ul> 038 */ 039public class ParserReader extends Reader implements Positionable { 040 041 /** Wrapped reader */ 042 protected final Reader r; 043 044 private char[] buff; // Internal character buffer 045 private int line = 1; // Current line number 046 private int column; // Current column number 047 private int iCurrent = 0; // Current pointer into character buffer 048 private int iMark = -1; // Mark position in buffer 049 private int iEnd = 0; // The last good character position in the buffer 050 private boolean endReached, holesExist; 051 private final boolean unbuffered; 052 053 /** 054 * Constructor. 055 * 056 * @param pipe The parser input. 057 * @throws IOException Thrown by underlying stream. 058 */ 059 public ParserReader(ParserPipe pipe) throws IOException { 060 this.unbuffered = pipe.unbuffered; 061 if (pipe.isString()) { 062 String in = pipe.getInputAsString(); 063 this.r = new CharSequenceReader(in); 064 this.buff = new char[in.length() < 1024 ? in.length() : 1024]; 065 } else { 066 Reader _r = pipe.getReader(); 067 if (_r instanceof ParserReader) 068 this.r = ((ParserReader)_r).r; 069 else 070 this.r = _r; 071 this.buff = new char[1024]; 072 } 073 pipe.setPositionable(this); 074 } 075 076 /** 077 * Reads a single character. 078 * 079 * <p> 080 * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather 081 * returns them as two <jk>char</jk>s. 082 * Use {@link #readCodePoint()} to ensure proper handling of extended unicode. 083 * 084 * @return The character read, or -1 if the end of the stream has been reached. 085 * @throws IOException If a problem occurred trying to read from the reader. 086 */ 087 @Override /* Reader */ 088 public final int read() throws IOException { 089 int c = readFromBuff(); 090 if (c == -1) 091 return -1; 092 if (c == '\n') { 093 line++; 094 column = 0; 095 } else { 096 column++; 097 } 098 return c; 099 } 100 101 /** 102 * Same as {@link #read()} but skips over any whitespace characters. 103 * 104 * @return The first non-whitespace character, or -1 if the end of stream reached. 105 * @throws IOException Thrown by underlying stream. 106 */ 107 public final int readSkipWs() throws IOException { 108 while (true) { 109 int c = read(); 110 if (c == -1 || ! Character.isWhitespace(c)) 111 return c; 112 } 113 } 114 115 /** 116 * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000). 117 * 118 * @return The character read, or -1 if the end of the stream has been reached. 119 * @throws IOException If a problem occurred trying to read from the reader. 120 */ 121 public final int readCodePoint() throws IOException { 122 int c = read(); 123 124 // Characters that take up 2 chars. 125 if (c >= 0xd800 && c <= 0xdbff) { 126 int low = read(); 127 if (low >= 0xdc00 && low <= 0xdfff) 128 c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00); 129 } 130 131 return c; 132 } 133 134 private final int readFromBuff() throws IOException { 135 while (iCurrent >= iEnd) { 136 if (endReached) 137 return -1; 138 139 // If there's still space at the end of this buffer, fill it. 140 // Make sure there's at least 2 character spaces free for extended unicode characters. 141 //if (false) { 142 if (iEnd+1 < buff.length) { 143 int x = read(buff, iCurrent, buff.length-iEnd); 144 if (x == -1) { 145 endReached = true; 146 return -1; 147 } 148 iEnd += x; 149 150 } else { 151 // If we're currently marking, then we want to copy from the current mark point 152 // to the beginning of the buffer and then fill in the remainder of buffer. 153 if (iMark >= 0) { 154 155 // If we're marking from the beginning of the array, we double the size of the 156 // buffer. This isn't likely to occur often. 157 if (iMark == 0) { 158 char[] buff2 = new char[buff.length<<1]; 159 System.arraycopy(buff, 0, buff2, 0, buff.length); 160 buff = buff2; 161 162 // Otherwise, we copy what's currently marked to the beginning of the buffer. 163 } else { 164 int copyBuff = iMark; 165 System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff); 166 iCurrent -= copyBuff; 167 iMark -= copyBuff; 168 } 169 int expected = buff.length - iCurrent; 170 171 int x = read(buff, iCurrent, expected); 172 if (x == -1) { 173 endReached = true; 174 iEnd = iCurrent; 175 return -1; 176 } 177 iEnd = iCurrent + x; 178 } else { 179 // Copy the last 10 chars in the buffer to the beginning of the buffer. 180 int copyBuff = Math.min(iCurrent, 10); 181 System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff); 182 183 // Number of characters we expect to copy on the next read. 184 int expected = buff.length - copyBuff; 185 int x = read(buff, copyBuff, expected); 186 iCurrent = copyBuff; 187 if (x == -1) { 188 endReached = true; 189 iEnd = iCurrent; 190 return -1; 191 } 192 iEnd = iCurrent + x; 193 } 194 } 195 } 196 return buff[iCurrent++]; 197 } 198 199 /** 200 * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}. 201 */ 202 public final void mark() { 203 iMark = iCurrent; 204 } 205 206 /** 207 * Peeks the next character in the stream. 208 * 209 * <p> 210 * This is equivalent to doing a {@code read()} followed by an {@code unread()}. 211 * 212 * @return The peeked character, or (char)-1 if the end of the stream has been reached. 213 * @throws IOException If a problem occurred trying to read from the reader. 214 */ 215 public final int peek() throws IOException { 216 int c = read(); 217 if (c != -1) 218 unread(); 219 return c; 220 } 221 222 /** 223 * Same as {@link #peek()} but skips over any whitespace characters. 224 * 225 * <p> 226 * This is equivalent to doing a {@code read()} followed by an {@code unread()}. 227 * 228 * @return The peeked character, or (char)-1 if the end of the stream has been reached. 229 * @throws IOException If a problem occurred trying to read from the reader. 230 */ 231 public final int peekSkipWs() throws IOException { 232 while(true) { 233 int c = read(); 234 boolean isWs = Character.isWhitespace(c); 235 if (c != -1 && ! isWs) 236 unread(); 237 if (! isWs) 238 return c; 239 } 240 } 241 242 /** 243 * Read the specified number of characters off the stream. 244 * 245 * @param num The number of characters to read. 246 * @return The characters packaged as a String. 247 * @throws IOException If a problem occurred trying to read from the reader. 248 */ 249 public final String read(int num) throws IOException { 250 char[] c = new char[num]; 251 for (int i = 0; i < num; i++) { 252 int c2 = read(); 253 if (c2 == -1) 254 return new String(c, 0, i); 255 c[i] = (char)c2; 256 } 257 return new String(c); 258 } 259 260 /** 261 * Pushes the last read character back into the stream. 262 * 263 * @return This object. 264 * @throws IOException If a problem occurred trying to read from the reader. 265 */ 266 public ParserReader unread() throws IOException { 267 if (iCurrent <= 0) 268 throw new IOException("Buffer underflow."); 269 iCurrent--; 270 if (column == 0) 271 line--; 272 else 273 column--; 274 return this; 275 } 276 277 /** 278 * No-op. 279 * 280 * <p> 281 * Input readers are closed in the {@link ParserPipe} class. 282 * 283 * @throws IOException If a problem occurred trying to read from the reader. 284 */ 285 @Override /* Reader */ 286 public void close() throws IOException { 287 // No-op 288 } 289 290 /** 291 * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage. 292 * 293 * @return The contents of the reusable character buffer as a string. 294 */ 295 public final String getMarked() { 296 return getMarked(0, 0); 297 } 298 299 /** 300 * Same as {@link #getMarked()} except allows you to specify offsets into the buffer. 301 * 302 * <p> 303 * For example, to return the marked string, but trim the first and last characters, call the following: 304 * <p class='bjava'> 305 * getFromMarked(1, -1); 306 * </p> 307 * 308 * @param offsetStart The offset of the start position. 309 * @param offsetEnd The offset of the end position. 310 * @return The contents of the reusable character buffer as a string. 311 */ 312 public final String getMarked(int offsetStart, int offsetEnd) { 313 int offset = 0; 314 315 // Holes are \u00FF 'delete' characters that we need to get rid of now. 316 if (holesExist) { 317 for (int i = iMark; i < iCurrent; i++) { 318 char c = buff[i]; 319 if (c == 127) 320 offset++; 321 else 322 buff[i-offset] = c; 323 } 324 holesExist = false; 325 } 326 int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset; 327 String s = new String(buff, start, len); 328 iMark = -1; 329 return s; 330 } 331 332 /** 333 * Trims off the last character in the marking buffer. 334 * 335 * <p> 336 * Useful for removing escape characters from sequences. 337 * 338 * @return This object. 339 */ 340 public final ParserReader delete() { 341 return delete(1); 342 } 343 344 /** 345 * Trims off the specified number of last characters in the marking buffer. 346 * Useful for removing escape characters from sequences. 347 * 348 * @param count The number of characters to delete. 349 * @return This object. 350 */ 351 public final ParserReader delete(int count) { 352 for (int i = 0; i < count; i++) 353 buff[iCurrent-i-1] = 127; 354 holesExist = true; 355 return this; 356 } 357 358 /** 359 * Replaces the last character in the marking buffer with the specified character. 360 * 361 * <p> 362 * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended 363 * unicode characters in order for the replacement to fit into the buffer. 364 * 365 * @param c The new character. 366 * @param offset The offset. 367 * @return This object. 368 * @throws IOException Thrown by underlying stream. 369 */ 370 public final ParserReader replace(int c, int offset) throws IOException { 371 if (c < 0x10000) { 372 if (offset < 1) 373 throw new IOException("Buffer underflow."); 374 buff[iCurrent-offset] = (char)c; 375 } else { 376 if (offset < 2) 377 throw new IOException("Buffer underflow."); 378 c -= 0x10000; 379 buff[iCurrent-offset] = (char)(0xd800 + (c >> 10)); 380 buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff)); 381 offset--; 382 } 383 // Fill in the gap with DEL characters. 384 for (int i = 1; i < offset; i++) 385 buff[iCurrent-i] = 127; 386 holesExist |= (offset > 1); 387 return this; 388 } 389 390 /** 391 * Replace the last read character in the buffer with the specified character. 392 * 393 * @param c The new character. 394 * @return This object. 395 * @throws IOException Thrown by underlying stream. 396 */ 397 public final ParserReader replace(char c) throws IOException { 398 return replace(c, 1); 399 } 400 /** 401 * Reads a numeric string from the specified reader. 402 * 403 * @return The parsed number string. 404 * @throws IOException Thrown by underlying stream. 405 */ 406 public String parseNumberString() throws IOException { 407 mark(); 408 int c = 0; 409 while (true) { 410 c = read(); 411 if (c == -1) 412 break; 413 if (! StringUtils.isNumberChar((char)c)) { 414 unread(); 415 break; 416 } 417 } 418 return getMarked(); 419 } 420 421 /** 422 * Subclasses can override this method to provide additional filtering. 423 * 424 * <p> 425 * Default implementation simply calls the same method on the underlying reader. 426 */ 427 @Override /* Reader */ 428 public int read(char[] cbuf, int off, int len) throws IOException { 429 return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len); 430 } 431 432 @Override /* Positionable */ 433 public Position getPosition() { 434 return new Position(line, column); 435 } 436}