// ***************************************************************************************************************************
// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
// * with the License.  You may obtain a copy of the License at                                                              *
// *                                                                                                                         *
// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
// *                                                                                                                         *
// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
// * specific language governing permissions and limitations under the License.                                              *
// ***************************************************************************************************************************
package org.apache.juneau.parser;

import java.io.*;

import org.apache.juneau.*;
import org.apache.juneau.internal.*;

/**
 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
 *
 * <p>
 * Code is optimized to work with a 1 character buffer.
 *
 * <p>
 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
 * characters from the previous mark point.
 *
 * <p>
 * <b>Warning:</b>  Not thread safe.
 */
public class ParserReader extends Reader implements Positionable {

	/** Buffer size used for non-string input, and the upper bound on the buffer size for string input. */
	private static final int BUFFER_SIZE = 1024;

	/** Maximum number of already-consumed characters kept at the front of the buffer on an unmarked refill. */
	private static final int PUSHBACK_SIZE = 10;

	/** Placeholder (ASCII DEL, <code>\u007F</code>) written over deleted characters; stripped by {@link #getMarked(int, int)}. */
	private static final char HOLE = 127;

	/** Wrapped reader */
	protected final Reader r;

	private char[] buff;       // Internal character buffer
	private int line = 1;      // Current line number
	private int column;        // Current column number
	private int iCurrent = 0;  // Current pointer into character buffer
	private int iMark = -1;    // Mark position in buffer (-1 when not marking)
	private int iEnd = 0;      // The last good character position in the buffer
	private boolean endReached, holesExist;
	private final boolean unbuffered;

	/**
	 * Constructor.
	 *
	 * <p>
	 * Registers this reader on the pipe as its {@link Positionable}.
	 *
	 * @param pipe The parser input.
	 * @throws IOException Propagated from the pipe while obtaining its input.
	 */
	public ParserReader(ParserPipe pipe) throws IOException {
		this.unbuffered = pipe.unbuffered;
		if (pipe.isString()) {
			String in = pipe.getInputAsString();
			this.r = new CharSequenceReader(in);
			// Never allocate more than the input can fill.
			this.buff = new char[Math.min(in.length(), BUFFER_SIZE)];
		} else {
			Reader in = pipe.getReader();
			// Don't stack one ParserReader on top of another; read from the innermost reader directly.
			if (in instanceof ParserReader)
				this.r = ((ParserReader)in).r;
			else
				this.r = in;
			this.buff = new char[BUFFER_SIZE];
		}
		pipe.setPositionable(this);
	}

	/**
	 * Reads a single character.
	 *
	 * <p>
	 * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
	 * returns them as two <jk>char</jk>s.
	 * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
	 *
	 * @return The character read, or -1 if the end of the stream has been reached.
	 * @throws IOException If a problem occurred trying to read from the reader.
	 */
	@Override /* Reader */
	public final int read() throws IOException {
		int c = readFromBuff();
		if (c == -1)
			return -1;
		if (c == '\n') {
			line++;
			column = 0;
		} else {
			column++;
		}
		return c;
	}

	/**
	 * Same as {@link #read()} but skips over any whitespace characters.
	 *
	 * @return The first non-whitespace character, or -1 if the end of stream reached.
	 * @throws IOException If a problem occurred trying to read from the reader.
	 */
	public final int readSkipWs() throws IOException {
		while (true) {
			int c = read();
			if (c == -1 || ! Character.isWhitespace(c))
				return c;
		}
	}

	/**
	 * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
	 *
	 * <p>
	 * If a high surrogate is not followed by a low surrogate, the high surrogate is returned as-is and the following
	 * character is pushed back so that it is returned by the next read.
	 *
	 * @return The character read, or -1 if the end of the stream has been reached.
	 * @throws IOException If a problem occurred trying to read from the reader.
	 */
	public final int readCodePoint() throws IOException {
		int c = read();

		// Characters that take up 2 chars.
		if (c >= 0xd800 && c <= 0xdbff) {
			int low = read();
			if (low >= 0xdc00 && low <= 0xdfff)
				c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
			else if (low != -1)
				unread();  // Unpaired high surrogate:  don't swallow the character that follows it.
		}

		return c;
	}

	/**
	 * Returns the next character from the internal buffer, refilling the buffer from {@link #read(char[], int, int)}
	 * when it has been exhausted.
	 *
	 * @return The next character, or -1 if the end of the stream has been reached.
	 * @throws IOException If a problem occurred trying to read from the reader.
	 */
	private int readFromBuff() throws IOException {
		// iCurrent only advances past a character when iCurrent < iEnd, so a refill always starts with iCurrent == iEnd.
		while (iCurrent >= iEnd) {
			if (endReached)
				return -1;

			// If there's still space at the end of this buffer, fill it.
			// Make sure there's at least 2 character spaces free for extended unicode characters.
			if (iEnd+1 < buff.length) {
				int x = read(buff, iCurrent, buff.length-iEnd);
				if (x == -1) {
					endReached = true;
					return -1;
				}
				iEnd += x;

			} else {
				// If we're currently marking, then we want to copy from the current mark point
				// to the beginning of the buffer and then fill in the remainder of buffer.
				if (iMark >= 0) {

					// If we're marking from the beginning of the array, we double the size of the
					// buffer.  This isn't likely to occur often.
					if (iMark == 0) {
						char[] buff2 = new char[buff.length<<1];
						System.arraycopy(buff, 0, buff2, 0, buff.length);
						buff = buff2;

					// Otherwise, we copy what's currently marked to the beginning of the buffer.
					} else {
						int copyBuff = iMark;
						System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
						iCurrent -= copyBuff;
						iMark -= copyBuff;
					}
					int expected = buff.length - iCurrent;

					int x = read(buff, iCurrent, expected);
					if (x == -1) {
						endReached = true;
						iEnd = iCurrent;
						return -1;
					}
					iEnd = iCurrent + x;
				} else {
					// Copy the last few chars in the buffer to the beginning of the buffer so unread() keeps working.
					int copyBuff = Math.min(iCurrent, PUSHBACK_SIZE);
					System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff);

					// Number of characters we expect to copy on the next read.
					int expected = buff.length - copyBuff;
					int x = read(buff, copyBuff, expected);
					iCurrent = copyBuff;
					if (x == -1) {
						endReached = true;
						iEnd = iCurrent;
						return -1;
					}
					iEnd = iCurrent + x;
				}
			}
		}
		return buff[iCurrent++];
	}

	/**
	 * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@link #getMarked()}.
	 */
	public final void mark() {
		iMark = iCurrent;
	}

	/**
	 * Peeks the next character in the stream.
	 *
	 * <p>
	 * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
	 *
	 * @return The peeked character, or (char)-1 if the end of the stream has been reached.
	 * @throws IOException If a problem occurred trying to read from the reader.
	 */
	public final int peek() throws IOException {
		int c = read();
		if (c != -1)
			unread();
		return c;
	}

	/**
	 * Same as {@link #peek()} but skips over any whitespace characters.
	 *
	 * <p>
	 * Leading whitespace is consumed; only the first non-whitespace character is pushed back with {@code unread()}.
	 *
	 * @return The peeked character, or (char)-1 if the end of the stream has been reached.
	 * @throws IOException If a problem occurred trying to read from the reader.
	 */
	public final int peekSkipWs() throws IOException {
		while (true) {
			int c = read();
			boolean isWs = Character.isWhitespace(c);
			if (c != -1 && ! isWs)
				unread();
			if (! isWs)
				return c;
		}
	}

	/**
	 * Read the specified number of characters off the stream.
	 *
	 * <p>
	 * If the end of the stream is reached first, the returned string contains only the characters that were read.
	 *
	 * @param num The number of characters to read.
	 * @return The characters packaged as a String.
	 * @throws IOException If a problem occurred trying to read from the reader.
	 */
	public final String read(int num) throws IOException {
		char[] c = new char[num];
		for (int i = 0; i < num; i++) {
			int c2 = read();
			if (c2 == -1)
				return new String(c, 0, i);
			c[i] = (char)c2;
		}
		return new String(c);
	}

	/**
	 * Pushes the last read character back into the stream.
	 *
	 * <p>
	 * When the current column is 0 the line number is decremented instead of the column; the column of the previous
	 * line is not restored.
	 *
	 * @return This object (for method chaining).
	 * @throws IOException If there is no character in the buffer to push back.
	 */
	public ParserReader unread() throws IOException {
		if (iCurrent <= 0)
			throw new IOException("Buffer underflow.");
		iCurrent--;
		if (column == 0)
			line--;
		else
			column--;
		return this;
	}

	/**
	 * No-op.
	 *
	 * <p>
	 * Input readers are closed in the {@link ParserPipe} class.
	 *
	 * @throws IOException If a problem occurred trying to read from the reader.
	 */
	@Override /* Reader */
	public void close() throws IOException {
		// No-op
	}

	/**
	 * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
	 *
	 * @return The contents of the reusable character buffer as a string.
	 */
	public final String getMarked() {
		return getMarked(0, 0);
	}

	/**
	 * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
	 *
	 * <p>
	 * For example, to return the marked string, but trim the first and last characters, call the following:
	 * <p class='bcode w800'>
	 * 	getMarked(1, -1);
	 * </p>
	 *
	 * <p>
	 * Clears the current mark.
	 *
	 * @param offsetStart The offset of the start position.
	 * @param offsetEnd The offset of the end position.
	 * @return The contents of the reusable character buffer as a string.
	 */
	public final String getMarked(int offsetStart, int offsetEnd) {
		int offset = 0;

		// Holes are \u007F 'delete' characters that we need to get rid of now.
		// Compact the marked region in place by shifting kept characters left over the holes.
		if (holesExist) {
			for (int i = iMark; i < iCurrent; i++) {
				char c = buff[i];
				if (c == HOLE)
					offset++;
				else
					buff[i-offset] = c;
			}
			holesExist = false;
		}
		int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
		String s = new String(buff, start, len);
		iMark = -1;
		return s;
	}

	/**
	 * Trims off the last character in the marking buffer.
	 *
	 * <p>
	 * Useful for removing escape characters from sequences.
	 *
	 * @return This object (for method chaining).
	 */
	public final ParserReader delete() {
		return delete(1);
	}

	/**
	 * Trims off the specified number of last characters in the marking buffer.
	 * Useful for removing escape characters from sequences.
	 *
	 * <p>
	 * The characters are overwritten with DEL placeholders which are removed on the next call to
	 * {@link #getMarked(int, int)}.
	 *
	 * @param count The number of characters to delete.
	 * @return This object (for method chaining).
	 */
	public final ParserReader delete(int count) {
		for (int i = 0; i < count; i++)
			buff[iCurrent-i-1] = HOLE;
		holesExist = true;
		return this;
	}

	/**
	 * Replaces the last <code>offset</code> characters in the marking buffer with the specified character.
	 *
	 * <p>
	 * <code>offset</code> must be at least <code>1</code> for normal characters, and <code>2</code> for extended
	 * unicode characters in order for the replacement to fit into the buffer.
	 *
	 * @param c The new character.
	 * @param offset The offset.
	 * @return This object (for method chaining).
	 * @throws IOException If <code>offset</code> is too small for the replacement, or reaches before the start of the buffer.
	 */
	public final ParserReader replace(int c, int offset) throws IOException {
		// Reject offsets that would index before the start of the buffer.
		if (offset > iCurrent)
			throw new IOException("Buffer underflow.");
		if (c < 0x10000) {
			if (offset < 1)
				throw new IOException("Buffer underflow.");
			buff[iCurrent-offset] = (char)c;
		} else {
			if (offset < 2)
				throw new IOException("Buffer underflow.");
			// Encode as a surrogate pair occupying the first two replaced positions.
			c -= 0x10000;
			buff[iCurrent-offset] = (char)(0xd800 + (c >> 10));
			buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff));
			offset--;
		}
		// Fill in the gap with DEL characters.
		for (int i = 1; i < offset; i++)
			buff[iCurrent-i] = HOLE;
		holesExist |= (offset > 1);
		return this;
	}

	/**
	 * Replace the last read character in the buffer with the specified character.
	 *
	 * @param c The new character.
	 * @return This object (for method chaining).
	 * @throws IOException If there is no character in the buffer to replace.
	 */
	public final ParserReader replace(char c) throws IOException {
		return replace(c, 1);
	}

	/**
	 * Subclasses can override this method to provide additional filtering.
	 *
	 * <p>
	 * Default implementation simply calls the same method on the underlying reader.
	 * In unbuffered mode a single character is requested from the underlying reader regardless of <code>len</code>.
	 */
	@Override /* Reader */
	public int read(char[] cbuf, int off, int len) throws IOException {
		return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
	}

	@Override /* Positionable */
	public Position getPosition() {
		return new Position(line, column);
	}

	/**
	 * @deprecated Unused.
	 */
	@SuppressWarnings("javadoc")
	@Deprecated
	public final int getLine() {
		return line;
	}

	/**
	 * @deprecated Unused.
	 */
	@SuppressWarnings("javadoc")
	@Deprecated
	public final int getColumn() {
		return column;
	}

	/**
	 * @deprecated Unused.
	 */
	@SuppressWarnings("javadoc")
	@Deprecated
	public ObjectMap getLocation(ParserSession session) {
		return session.getLastLocation().append("line", getLine()).append("column", getColumn());
	}

	/**
	 * @deprecated Unused.
	 */
	@SuppressWarnings("javadoc")
	@Deprecated
	public final ParserPipe getPipe() {
		return null;
	}
}