001// *************************************************************************************************************************** 002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file * 003// * distributed with this work for additional information regarding copyright ownership. The ASF licenses this file * 004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance * 005// * with the License. You may obtain a copy of the License at * 006// * * 007// * http://www.apache.org/licenses/LICENSE-2.0 * 008// * * 009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an * 010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * 011// * specific language governing permissions and limitations under the License. * 012// *************************************************************************************************************************** 013package org.apache.juneau.parser; 014 015import java.io.*; 016 017import org.apache.juneau.internal.*; 018 019/** 020 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character. 021 * 022 * <p> 023 * Code is optimized to work with a 1 character buffer. 024 * 025 * <p> 026 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture 027 * characters from the previous mark point. 028 * 029 * <p> 030 * <b>Warning:</b> Not thread safe. 031 */ 032public class ParserReader extends Reader implements Positionable { 033 034 /** Wrapped reader */ 035 protected final Reader r; 036 037 private char[] buff; // Internal character buffer 038 private int line = 1; // Current line number 039 private int column; // Current column number 040 private int iCurrent = 0; // Current pointer into character buffer 041 private int iMark = -1; // Mark position in buffer 042 private int iEnd = 0; // The last good character position in the buffer 043 private boolean endReached, holesExist; 044 private final boolean unbuffered; 045 046 /** 047 * Constructor. 048 * 049 * @param pipe The parser input. 050 * @throws IOException Thrown by underlying stream. 051 */ 052 public ParserReader(ParserPipe pipe) throws IOException { 053 this.unbuffered = pipe.unbuffered; 054 if (pipe.isString()) { 055 String in = pipe.getInputAsString(); 056 this.r = new CharSequenceReader(in); 057 this.buff = new char[in.length() < 1024 ? in.length() : 1024]; 058 } else { 059 Reader _r = pipe.getReader(); 060 if (_r instanceof ParserReader) 061 this.r = ((ParserReader)_r).r; 062 else 063 this.r = _r; 064 this.buff = new char[1024]; 065 } 066 pipe.setPositionable(this); 067 } 068 069 /** 070 * Reads a single character. 071 * 072 * <p> 073 * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather 074 * returns them as two <jk>char</jk>s. 075 * Use {@link #readCodePoint()} to ensure proper handling of extended unicode. 076 * 077 * @return The character read, or -1 if the end of the stream has been reached. 078 * @throws IOException If a problem occurred trying to read from the reader. 079 */ 080 @Override /* Reader */ 081 public final int read() throws IOException { 082 int c = readFromBuff(); 083 if (c == -1) 084 return -1; 085 if (c == '\n') { 086 line++; 087 column = 0; 088 } else { 089 column++; 090 } 091 return c; 092 } 093 094 /** 095 * Same as {@link #read()} but skips over any whitespace characters. 096 * 097 * @return The first non-whitespace character, or -1 if the end of stream reached. 098 * @throws IOException Thrown by underlying stream. 099 */ 100 public final int readSkipWs() throws IOException { 101 while (true) { 102 int c = read(); 103 if (c == -1 || ! Character.isWhitespace(c)) 104 return c; 105 } 106 } 107 108 /** 109 * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000). 110 * 111 * @return The character read, or -1 if the end of the stream has been reached. 112 * @throws IOException If a problem occurred trying to read from the reader. 113 */ 114 public final int readCodePoint() throws IOException { 115 int c = read(); 116 117 // Characters that take up 2 chars. 118 if (c >= 0xd800 && c <= 0xdbff) { 119 int low = read(); 120 if (low >= 0xdc00 && low <= 0xdfff) 121 c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00); 122 } 123 124 return c; 125 } 126 127 private final int readFromBuff() throws IOException { 128 while (iCurrent >= iEnd) { 129 if (endReached) 130 return -1; 131 132 // If there's still space at the end of this buffer, fill it. 133 // Make sure there's at least 2 character spaces free for extended unicode characters. 134 //if (false) { 135 if (iEnd+1 < buff.length) { 136 int x = read(buff, iCurrent, buff.length-iEnd); 137 if (x == -1) { 138 endReached = true; 139 return -1; 140 } 141 iEnd += x; 142 143 } else { 144 // If we're currently marking, then we want to copy from the current mark point 145 // to the beginning of the buffer and then fill in the remainder of buffer. 146 if (iMark >= 0) { 147 148 // If we're marking from the beginning of the array, we double the size of the 149 // buffer. This isn't likely to occur often. 150 if (iMark == 0) { 151 char[] buff2 = new char[buff.length<<1]; 152 System.arraycopy(buff, 0, buff2, 0, buff.length); 153 buff = buff2; 154 155 // Otherwise, we copy what's currently marked to the beginning of the buffer. 156 } else { 157 int copyBuff = iMark; 158 System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff); 159 iCurrent -= copyBuff; 160 iMark -= copyBuff; 161 } 162 int expected = buff.length - iCurrent; 163 164 int x = read(buff, iCurrent, expected); 165 if (x == -1) { 166 endReached = true; 167 iEnd = iCurrent; 168 return -1; 169 } 170 iEnd = iCurrent + x; 171 } else { 172 // Copy the last 10 chars in the buffer to the beginning of the buffer. 173 int copyBuff = Math.min(iCurrent, 10); 174 System.arraycopy(buff, iCurrent-copyBuff, buff, 0, copyBuff); 175 176 // Number of characters we expect to copy on the next read. 177 int expected = buff.length - copyBuff; 178 int x = read(buff, copyBuff, expected); 179 iCurrent = copyBuff; 180 if (x == -1) { 181 endReached = true; 182 iEnd = iCurrent; 183 return -1; 184 } 185 iEnd = iCurrent + x; 186 } 187 } 188 } 189 return buff[iCurrent++]; 190 } 191 192 /** 193 * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}. 194 */ 195 public final void mark() { 196 iMark = iCurrent; 197 } 198 199 /** 200 * Peeks the next character in the stream. 201 * 202 * <p> 203 * This is equivalent to doing a {@code read()} followed by an {@code unread()}. 204 * 205 * @return The peeked character, or (char)-1 if the end of the stream has been reached. 206 * @throws IOException If a problem occurred trying to read from the reader. 207 */ 208 public final int peek() throws IOException { 209 int c = read(); 210 if (c != -1) 211 unread(); 212 return c; 213 } 214 215 /** 216 * Same as {@link #peek()} but skips over any whitespace characters. 217 * 218 * <p> 219 * This is equivalent to doing a {@code read()} followed by an {@code unread()}. 220 * 221 * @return The peeked character, or (char)-1 if the end of the stream has been reached. 222 * @throws IOException If a problem occurred trying to read from the reader. 223 */ 224 public final int peekSkipWs() throws IOException { 225 while(true) { 226 int c = read(); 227 boolean isWs = Character.isWhitespace(c); 228 if (c != -1 && ! isWs) 229 unread(); 230 if (! isWs) 231 return c; 232 } 233 } 234 235 /** 236 * Read the specified number of characters off the stream. 237 * 238 * @param num The number of characters to read. 239 * @return The characters packaged as a String. 240 * @throws IOException If a problem occurred trying to read from the reader. 241 */ 242 public final String read(int num) throws IOException { 243 char[] c = new char[num]; 244 for (int i = 0; i < num; i++) { 245 int c2 = read(); 246 if (c2 == -1) 247 return new String(c, 0, i); 248 c[i] = (char)c2; 249 } 250 return new String(c); 251 } 252 253 /** 254 * Pushes the last read character back into the stream. 255 * 256 * @return This object (for method chaining). 257 * @throws IOException If a problem occurred trying to read from the reader. 258 */ 259 public ParserReader unread() throws IOException { 260 if (iCurrent <= 0) 261 throw new IOException("Buffer underflow."); 262 iCurrent--; 263 if (column == 0) 264 line--; 265 else 266 column--; 267 return this; 268 } 269 270 /** 271 * No-op. 272 * 273 * <p> 274 * Input readers are closed in the {@link ParserPipe} class. 275 * 276 * @throws IOException If a problem occurred trying to read from the reader. 277 */ 278 @Override /* Reader */ 279 public void close() throws IOException { 280 // No-op 281 } 282 283 /** 284 * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage. 285 * 286 * @return The contents of the reusable character buffer as a string. 287 */ 288 public final String getMarked() { 289 return getMarked(0, 0); 290 } 291 292 /** 293 * Same as {@link #getMarked()} except allows you to specify offsets into the buffer. 294 * 295 * <p> 296 * For example, to return the marked string, but trim the first and last characters, call the following: 297 * <p class='bcode w800'> 298 * getFromMarked(1, -1); 299 * </p> 300 * 301 * @param offsetStart The offset of the start position. 302 * @param offsetEnd The offset of the end position. 303 * @return The contents of the reusable character buffer as a string. 304 */ 305 public final String getMarked(int offsetStart, int offsetEnd) { 306 int offset = 0; 307 308 // Holes are \u00FF 'delete' characters that we need to get rid of now. 309 if (holesExist) { 310 for (int i = iMark; i < iCurrent; i++) { 311 char c = buff[i]; 312 if (c == 127) 313 offset++; 314 else 315 buff[i-offset] = c; 316 } 317 holesExist = false; 318 } 319 int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset; 320 String s = new String(buff, start, len); 321 iMark = -1; 322 return s; 323 } 324 325 /** 326 * Trims off the last character in the marking buffer. 327 * 328 * <p> 329 * Useful for removing escape characters from sequences. 330 * 331 * @return This object (for method chaining). 332 */ 333 public final ParserReader delete() { 334 return delete(1); 335 } 336 337 /** 338 * Trims off the specified number of last characters in the marking buffer. 339 * Useful for removing escape characters from sequences. 340 * 341 * @param count The number of characters to delete. 342 * @return This object (for method chaining). 343 */ 344 public final ParserReader delete(int count) { 345 for (int i = 0; i < count; i++) 346 buff[iCurrent-i-1] = 127; 347 holesExist = true; 348 return this; 349 } 350 351 /** 352 * Replaces the last character in the marking buffer with the specified character. 353 * 354 * <p> 355 * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended 356 * unicode characters in order for the replacement to fit into the buffer. 357 * 358 * @param c The new character. 359 * @param offset The offset. 360 * @return This object (for method chaining). 361 * @throws IOException Thrown by underlying stream. 362 */ 363 public final ParserReader replace(int c, int offset) throws IOException { 364 if (c < 0x10000) { 365 if (offset < 1) 366 throw new IOException("Buffer underflow."); 367 buff[iCurrent-offset] = (char)c; 368 } else { 369 if (offset < 2) 370 throw new IOException("Buffer underflow."); 371 c -= 0x10000; 372 buff[iCurrent-offset] = (char)(0xd800 + (c >> 10)); 373 buff[iCurrent-offset+1] = (char)(0xdc00 + (c & 0x3ff)); 374 offset--; 375 } 376 // Fill in the gap with DEL characters. 377 for (int i = 1; i < offset; i++) 378 buff[iCurrent-i] = 127; 379 holesExist |= (offset > 1); 380 return this; 381 } 382 383 /** 384 * Replace the last read character in the buffer with the specified character. 385 * 386 * @param c The new character. 387 * @return This object (for method chaining). 388 * @throws IOException Thrown by underlying stream. 389 */ 390 public final ParserReader replace(char c) throws IOException { 391 return replace(c, 1); 392 } 393 394 /** 395 * Subclasses can override this method to provide additional filtering. 396 * 397 * <p> 398 * Default implementation simply calls the same method on the underlying reader. 399 */ 400 @Override /* Reader */ 401 public int read(char[] cbuf, int off, int len) throws IOException { 402 return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len); 403 } 404 405 @Override /* Positionable */ 406 public Position getPosition() { 407 return new Position(line, column); 408 } 409}