001// *************************************************************************************************************************** 002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file * 003// * distributed with this work for additional information regarding copyright ownership. The ASF licenses this file * 004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance * 005// * with the License. You may obtain a copy of the License at * 006// * * 007// * http://www.apache.org/licenses/LICENSE-2.0 * 008// * * 009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an * 010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * 011// * specific language governing permissions and limitations under the License. * 012// *************************************************************************************************************************** 013package org.apache.juneau.parser; 014 015import java.io.*; 016 017import org.apache.juneau.*; 018import org.apache.juneau.internal.*; 019 020/** 021 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character. 022 * 023 * <p> 024 * Code is optimized to work with a 1 character buffer. 025 * 026 * <p> 027 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture 028 * characters from the previous mark point. 029 * 030 * <p> 031 * <b>Warning:</b> Not thread safe. 032 */ 033public class ParserReader extends Reader { 034 035 /** Wrapped reader */ 036 protected final Reader r; 037 private final ParserPipe pipe; 038 039 private char[] buff; // Internal character buffer 040 private int line = 1; // Current line number 041 private int column; // Current column number 042 private int iCurrent = 0; // Current pointer into character buffer 043 private int iMark = -1; // Mark position in buffer 044 private int iEnd = 0; // The last good character position in the buffer 045 private boolean endReached, holesExist; 046 private final boolean unbuffered; 047 048 /** 049 * Constructor. 050 * 051 * @param pipe The parser input. 052 * @throws IOException 053 */ 054 public ParserReader(ParserPipe pipe) throws IOException { 055 this.pipe = pipe; 056 this.unbuffered = pipe.unbuffered; 057 if (pipe.isString()) { 058 String in = pipe.getInputAsString(); 059 this.r = new CharSequenceReader(in); 060 this.buff = new char[in.length() < 1024 ? in.length() : 1024]; 061 } else { 062 Reader _r = pipe.getReader(); 063 if (_r instanceof ParserReader) 064 this.r = ((ParserReader)_r).r; 065 else 066 this.r = _r; 067 this.buff = new char[1024]; 068 } 069 } 070 071 072 /** 073 * Returns the current line number position in this reader. 074 * 075 * @return The current line number. 076 */ 077 public final int getLine() { 078 return line; 079 } 080 081 /** 082 * Returns the current column number position in this reader. 083 * 084 * @return The current column number. 085 */ 086 public final int getColumn() { 087 return column; 088 } 089 090 /** 091 * Reads a single character. 092 * 093 * <p> 094 * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather 095 * returns them as two <jk>char</jk>s. 096 * Use {@link #readCodePoint()} to ensure proper handling of extended unicode. 097 * 098 * @return The character read, or -1 if the end of the stream has been reached. 099 * @throws IOException If a problem occurred trying to read from the reader. 100 */ 101 @Override /* Reader */ 102 public final int read() throws IOException { 103 int c = readFromBuff(); 104 if (c == -1) 105 return -1; 106 if (c == '\n') { 107 line++; 108 column = 0; 109 } else { 110 column++; 111 } 112 return c; 113 } 114 115 /** 116 * Same as {@link #read()} but skips over any whitespace characters. 117 * 118 * @return The first non-whitespace character, or -1 if the end of stream reached. 119 * @throws IOException 120 */ 121 public final int readSkipWs() throws IOException { 122 while (true) { 123 int c = read(); 124 if (c == -1 || ! Character.isWhitespace(c)) 125 return c; 126 } 127 } 128 129 /** 130 * Same as {@link #read()} but detects and combines extended unicode characters (i.e. characters above 0x10000). 131 * 132 * @return The character read, or -1 if the end of the stream has been reached. 133 * @throws IOException If a problem occurred trying to read from the reader. 134 */ 135 public final int readCodePoint() throws IOException { 136 int c = read(); 137 138 // Characters that take up 2 chars. 139 if (c >= 0xd800 && c <= 0xdbff) { 140 int low = read(); 141 if (low >= 0xdc00 && low <= 0xdfff) 142 c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00); 143 } 144 145 return c; 146 } 147 148 private final int readFromBuff() throws IOException { 149 while (iCurrent >= iEnd) { 150 if (endReached) 151 return -1; 152 153 // If there's still space at the end of this buffer, fill it. 154 // Make sure there's at least 2 character spaces free for extended unicode characters. 155 //if (false) { 156 if (iEnd+1 < buff.length) { 157 int x = read(buff, iCurrent, buff.length-iEnd); 158 if (x == -1) { 159 endReached = true; 160 return -1; 161 } 162 iEnd += x; 163 164 } else { 165 // If we're currently marking, then we want to copy from the current mark point 166 // to the beginning of the buffer and then fill in the remainder of buffer. 167 if (iMark >= 0) { 168 169 // If we're marking from the beginning of the array, we double the size of the 170 // buffer. This isn't likely to occur often. 171 if (iMark == 0) { 172 char[] buff2 = new char[buff.length<<1]; 173 System.arraycopy(buff, 0, buff2, 0, buff.length); 174 buff = buff2; 175 176 // Otherwise, we copy what's currently marked to the beginning of the buffer. 177 } else { 178 int copyBuff = iMark; 179 System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff); 180 iCurrent -= copyBuff; 181 iMark -= copyBuff; 182 } 183 int expected = buff.length - iCurrent; 184 185 int x = read(buff, iCurrent, expected); 186 if (x == -1) { 187 endReached = true; 188 iEnd = iCurrent; 189 return -1; 190 } 191 iEnd = iCurrent + x; 192 } else { 193 // Copy the last 10 chars in the buffer to the beginning of the buffer. 194 int copyBuff = Math.min(iCurrent, 10); 195 System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff); 196 197 // Number of characters we expect to copy on the next read. 198 int expected = buff.length - copyBuff; 199 int x = read(buff, copyBuff, expected); 200 iCurrent = copyBuff; 201 if (x == -1) { 202 endReached = true; 203 iEnd = iCurrent; 204 return -1; 205 } 206 iEnd = iCurrent + x; 207 } 208 } 209 } 210 return buff[iCurrent++]; 211 } 212 213 /** 214 * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}. 215 */ 216 public final void mark() { 217 iMark = iCurrent; 218 } 219 220 /** 221 * Peeks the next character in the stream. 222 * 223 * <p> 224 * This is equivalent to doing a {@code read()} followed by an {@code unread()}. 225 * 226 * @return The peeked character, or (char)-1 if the end of the stream has been reached. 227 * @throws IOException If a problem occurred trying to read from the reader. 228 */ 229 public final int peek() throws IOException { 230 int c = read(); 231 if (c != -1) 232 unread(); 233 return c; 234 } 235 236 /** 237 * Same as {@link #peek()} but skips over any whitespace characters. 238 * 239 * <p> 240 * This is equivalent to doing a {@code read()} followed by an {@code unread()}. 241 * 242 * @return The peeked character, or (char)-1 if the end of the stream has been reached. 243 * @throws IOException If a problem occurred trying to read from the reader. 244 */ 245 public final int peekSkipWs() throws IOException { 246 while(true) { 247 int c = read(); 248 boolean isWs = Character.isWhitespace(c); 249 if (c != -1 && ! isWs) 250 unread(); 251 if (! isWs) 252 return c; 253 } 254 } 255 256 /** 257 * Read the specified number of characters off the stream. 258 * 259 * @param num The number of characters to read. 260 * @return The characters packaged as a String. 261 * @throws IOException If a problem occurred trying to read from the reader. 262 */ 263 public final String read(int num) throws IOException { 264 char[] c = new char[num]; 265 for (int i = 0; i < num; i++) { 266 int c2 = read(); 267 if (c2 == -1) 268 return new String(c, 0, i); 269 c[i] = (char)c2; 270 } 271 return new String(c); 272 } 273 274 /** 275 * Pushes the last read character back into the stream. 276 * 277 * @return This object (for method chaining). 278 * @throws IOException If a problem occurred trying to read from the reader. 279 */ 280 public ParserReader unread() throws IOException { 281 if (iCurrent <= 0) 282 throw new IOException("Buffer underflow."); 283 iCurrent--; 284 column--; 285 return this; 286 } 287 288 /** 289 * No-op. 290 * 291 * <p> 292 * Input readers are closed in the {@link ParserPipe} class. 293 * 294 * @throws IOException If a problem occurred trying to read from the reader. 295 */ 296 @Override /* Reader */ 297 public void close() throws IOException { 298 // No-op 299 } 300 301 /** 302 * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage. 303 * 304 * @return The contents of the reusable character buffer as a string. 305 */ 306 public final String getMarked() { 307 return getMarked(0, 0); 308 } 309 310 /** 311 * Same as {@link #getMarked()} except allows you to specify offsets into the buffer. 312 * 313 * <p> 314 * For example, to return the marked string, but trim the first and last characters, call the following: 315 * <p class='bcode'> 316 * getFromMarked(1, -1); 317 * </p> 318 * 319 * @param offsetStart The offset of the start position. 320 * @param offsetEnd The offset of the end position. 321 * @return The contents of the reusable character buffer as a string. 322 */ 323 public final String getMarked(int offsetStart, int offsetEnd) { 324 int offset = 0; 325 326 // Holes are \u00FF 'delete' characters that we need to get rid of now. 327 if (holesExist) { 328 for (int i = iMark; i < iCurrent; i++) { 329 char c = buff[i]; 330 if (c == 127) 331 offset++; 332 else 333 buff[i-offset] = c; 334 } 335 holesExist = false; 336 } 337 int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset; 338 String s = new String(buff, start, len); 339 iMark = -1; 340 return s; 341 } 342 343 /** 344 * Trims off the last character in the marking buffer. 345 * 346 * <p> 347 * Useful for removing escape characters from sequences. 348 * 349 * @return This object (for method chaining). 350 */ 351 public final ParserReader delete() { 352 return delete(1); 353 } 354 355 /** 356 * Trims off the specified number of last characters in the marking buffer. 357 * Useful for removing escape characters from sequences. 358 * 359 * @param count The number of characters to delete. 360 * @return This object (for method chaining). 361 */ 362 public final ParserReader delete(int count) { 363 for (int i = 0; i < count; i++) 364 buff[iCurrent-i-1] = 127; 365 holesExist = true; 366 return this; 367 } 368 369 /** 370 * Replaces the last character in the marking buffer with the specified character. 371 * 372 * <p> 373 * <code>offset</code> must be at least <code>1</code> for normal characters, and <code>2</code> for extended 374 * unicode characters in order for the replacement to fit into the buffer. 375 * 376 * @param c The new character. 377 * @param offset The offset. 378 * @return This object (for method chaining). 379 * @throws IOException 380 */ 381 public final ParserReader replace(int c, int offset) throws IOException { 382 if (c < 0x10000) { 383 if (offset < 1) 384 throw new IOException("Buffer underflow."); 385 buff[iCurrent-offset] = (char)c; 386 } else { 387 if (offset < 2) 388 throw new IOException("Buffer underflow."); 389 c -= 0x10000; 390 buff[iCurrent-offset] = (char)(0xd800 + (c >> 10)); 391 buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff)); 392 offset--; 393 } 394 // Fill in the gap with DEL characters. 395 for (int i = 1; i < offset; i++) 396 buff[iCurrent-i] = 127; 397 holesExist |= (offset > 1); 398 return this; 399 } 400 401 /** 402 * Replace the last read character in the buffer with the specified character. 403 * 404 * @param c The new character. 405 * @return This object (for method chaining). 406 * @throws IOException 407 */ 408 public final ParserReader replace(char c) throws IOException { 409 return replace(c, 1); 410 } 411 412 /** 413 * Subclasses can override this method to provide additional filtering. 414 * 415 * <p> 416 * Default implementation simply calls the same method on the underlying reader. 417 */ 418 @Override /* Reader */ 419 public int read(char[] cbuf, int off, int len) throws IOException { 420 return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len); 421 } 422 423 /** 424 * Returns the combined location information on both this reader and the session. 425 * 426 * @param session The session object to read the last location on. 427 * @return A new map describing the current parse location. 428 */ 429 public ObjectMap getLocation(ParserSession session) { 430 return session.getLastLocation().append("line", getLine()).append("column", getColumn()); 431 } 432 433 /** 434 * Returns the pipe that was passed into the constructor. 435 * 436 * @return The pipe that was passed into the constructor. 437 */ 438 public final ParserPipe getPipe() { 439 return pipe; 440 } 441}