001/** 002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC 003 * and GZip files. (http://jwat.org/) 004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/) 005 * 006 * Licensed under the Apache License, Version 2.0 (the "License"); 007 * you may not use this file except in compliance with the License. 008 * You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.jwat.warc; 019 020import org.jwat.common.Diagnosis; 021import org.jwat.common.Diagnostics; 022import org.jwat.common.HeaderLineReader; 023import org.jwat.common.UriProfile; 024 025import java.io.Closeable; 026import java.io.IOException; 027import java.io.InputStream; 028import java.util.Iterator; 029import java.util.NoSuchElementException; 030 031/** 032 * Base class for WARC reader implementations. 033 * 034 * @author nicl 035 */ 036public abstract class WarcReader implements Closeable { 037 038 /* 039 * Settings. 040 */ 041 042 /** WARC-Target-URI profile. */ 043 protected UriProfile warcTargetUriProfile; 044 045 /** URI profile. */ 046 protected UriProfile uriProfile; 047 048 /** Default block digest algorithm to use if none is present in the 049 * record. */ 050 protected String blockDigestAlgorithm; 051 052 /** Default encoding scheme used to encode block digest into a string, 053 * if none is detected from the record. */ 054 protected String blockDigestEncoding = "base32"; 055 056 /** Payload Digest enabled/disabled. */ 057 protected boolean bPayloadDigest = false; 058 059 /** Default payload digest algorithm to use if none is present in the 060 * record. */ 061 protected String payloadDigestAlgorithm; 062 063 /** Default encoding scheme used to encode payload digest into a string, 064 * if none is detected from the record. */ 065 protected String payloadDigestEncoding = "base32"; 066 067 /** Block Digest enabled/disabled. */ 068 protected boolean bBlockDigest = false; 069 070 /** Max size allowed for a record header. */ 071 protected int recordHeaderMaxSize; 072 073 /** Max size allowed for a payload header. */ 074 protected int payloadHeaderMaxSize; 075 076 /** Line reader used to read version lines. */ 077 protected HeaderLineReader lineReader; 078 079 /** Header line reader used to read the WARC headers. */ 080 protected HeaderLineReader headerLineReader; 081 082 /** WARC field parser used. */ 083 protected WarcFieldParsers fieldParsers; 084 085 /* 086 * State. 087 */ 088 089 /** Reader level errors and warnings or when no record is available. */ 090 public final Diagnostics<Diagnosis> diagnostics = new Diagnostics<Diagnosis>(); 091 092 /** Compliance status for records parsed up to now. */ 093 protected boolean bIsCompliant = true; 094 095 /** Number of bytes consumed by this reader. */ 096 protected long consumed = 0; 097 098 /** Records parsed. */ 099 protected int records = 0; 100 101 /** Aggregated number of errors encountered while parsing. */ 102 protected int errors = 0; 103 104 /** Aggregate number of warnings encountered while parsing. */ 105 protected int warnings = 0; 106 107 /** Current WARC record object. */ 108 protected WarcRecord currentRecord; 109 110 /** Exception thrown while using the iterator. */ 111 protected Exception iteratorExceptionThrown; 112 113 /** 114 * Method used to initialize a readers internal state. 115 * Must be called by all constructors. 116 */ 117 protected void init() { 118 warcTargetUriProfile = UriProfile.RFC3986; 119 uriProfile = UriProfile.RFC3986; 120 recordHeaderMaxSize = 8192; 121 payloadHeaderMaxSize = 32768; 122 lineReader = HeaderLineReader.getReader(); 123 lineReader.bNameValue = false; 124 lineReader.encoding = HeaderLineReader.ENC_US_ASCII; 125 headerLineReader = HeaderLineReader.getReader(); 126 headerLineReader.bNameValue = true; 127 headerLineReader.encoding = HeaderLineReader.ENC_UTF8; 128 headerLineReader.bLWS = true; 129 headerLineReader.bQuotedText = true; 130 headerLineReader.bEncodedWords = true; 131 fieldParsers = new WarcFieldParsers(); 132 } 133 134 /** 135 * Reset reader for reuse. 136 */ 137 public void reset() { 138 diagnostics.reset(); 139 bIsCompliant = true; 140 consumed = 0; 141 records = 0; 142 errors = 0; 143 warnings = 0; 144 currentRecord = null; 145 } 146 147 /** 148 * Returns a boolean indicating if all records parsed so far are compliant. 149 * @return a boolean indicating if all records parsed so far are compliant 150 */ 151 public boolean isCompliant() { 152 return bIsCompliant; 153 } 154 155 /** 156 * Is this reader assuming GZip compressed input. 157 * @return boolean indicating the assumption of GZip compressed input 158 */ 159 public abstract boolean isCompressed(); 160 161 /** 162 * Set the URI profile used to validate WARC-Target URIs. 163 * If null, the uriProfile is set to RCF3986. 164 * @param uriProfile URI profile to use 165 */ 166 public void setWarcTargetUriProfile(UriProfile uriProfile) { 167 if (uriProfile == null) { 168 uriProfile = UriProfile.RFC3986; 169 } 170 this.warcTargetUriProfile = uriProfile; 171 } 172 173 /** 174 * Get the URI profile used to validate WARC-Target URIs. 175 * @return the URI profile used to validate WARC-Target URIs 176 */ 177 public UriProfile getWarcTargetUriProfile() { 178 return warcTargetUriProfile; 179 } 180 181 /** 182 * Set the URI profile used to validate URIs. 183 * If null, the uriProfile is set to RCF3986. 184 * @param uriProfile URI profile to use 185 */ 186 public void setUriProfile(UriProfile uriProfile) { 187 if (uriProfile == null) { 188 uriProfile = UriProfile.RFC3986; 189 } 190 this.uriProfile = uriProfile; 191 } 192 193 /** 194 * Get the URI profile used to validate URIs. 195 * @return the URI profile used to validate URIs 196 */ 197 public UriProfile getUriProfile() { 198 return uriProfile; 199 } 200 201 /** 202 * Get the readers block digest on/off status. 203 * @return boolean indicating block digest on/off 204 */ 205 public boolean getBlockDigestEnabled() { 206 return bBlockDigest; 207 } 208 209 /** 210 * Set the readers block digest on/off status. Digest, however, 211 * will only be computed if either a Warc-Block-Digest header is 212 * present or an optional algorithm has been chosen. 213 * The Warc-Block-Digest always takes precedence. 214 * @param enabled boolean indicating block digest on/off 215 */ 216 public void setBlockDigestEnabled(boolean enabled) { 217 bBlockDigest = enabled; 218 } 219 220 /** 221 * Get the readers payload digest on/off status. 222 * @return boolean indicating payload digest on/off 223 */ 224 public boolean getPayloadDigestEnabled() { 225 return bPayloadDigest; 226 } 227 228 /** 229 * Set the readers payload digest on/off status. Digest, however, 230 * will only be computed if either a Warc-Payload-Digest header is 231 * present or an optional algorithm has been chosen. 232 * The Warc-Payload-Digest always takes precedence. 233 * @param enabled boolean indicating payload digest on/off 234 */ 235 public void setPayloadDigestEnabled(boolean enabled) { 236 bPayloadDigest = enabled; 237 } 238 239 /** 240 * Get the default block digest algorithm. 241 * @return default block digest algorithm 242 */ 243 public String getBlockDigestAlgorithm() { 244 return blockDigestAlgorithm; 245 } 246 247 /** 248 * Tries to set the default block digest algorithm and returns a boolean 249 * indicating whether the algorithm was accepted or not. This algorithm is 250 * only used in case no WARC payload digest header is present in the record. 251 * @param digestAlgorithm block digest algorithm 252 * (null means no default block digest algorithm is selected) 253 * @return boolean indicating the validity of the algorithm supplied 254 */ 255 public boolean setBlockDigestAlgorithm(String digestAlgorithm) { 256 if (digestAlgorithm == null || digestAlgorithm.length() == 0) { 257 blockDigestAlgorithm = null; 258 return true; 259 } 260 if (WarcDigest.digestAlgorithmLength(digestAlgorithm) > 0) { 261 blockDigestAlgorithm = digestAlgorithm; 262 return true; 263 } 264 return false; 265 } 266 267 /** 268 * Get the default payload digest algorithm. 269 * @return default payload digest algorithm 270 */ 271 public String getPayloadDigestAlgorithm() { 272 return payloadDigestAlgorithm; 273 } 274 275 /** 276 * Tries to set the default payload digest algorithm and returns a boolean 277 * indicating whether the algorithm was accepted or not. This algorithm is 278 * only used in case no WARC payload digest header is present in the record. 279 * @param digestAlgorithm payload digest algorithm 280 * (null means no default payload digest algorithm is selected) 281 * @return boolean indicating the validity of the algorithm supplied 282 */ 283 public boolean setPayloadDigestAlgorithm(String digestAlgorithm) { 284 if (digestAlgorithm == null || digestAlgorithm.length() == 0) { 285 payloadDigestAlgorithm = null; 286 return true; 287 } 288 if (WarcDigest.digestAlgorithmLength(digestAlgorithm) > 0) { 289 payloadDigestAlgorithm = digestAlgorithm; 290 return true; 291 } 292 return false; 293 } 294 295 /** 296 * Get the default block digest encoding scheme. 297 * @return default block digest encoding scheme 298 */ 299 public String getBlockDigestEncoding() { 300 return blockDigestEncoding; 301 } 302 303 /** 304 * Set the default block digest encoding scheme. This scheme is only 305 * used if none can be inferred from an existing block digest header. 306 * @param encodingScheme encoding scheme 307 * (null means default block digest is not encoded) 308 */ 309 public void setBlockDigestEncoding(String encodingScheme) { 310 if (encodingScheme != null && encodingScheme.length() > 0) { 311 blockDigestEncoding = encodingScheme.toLowerCase(); 312 } else { 313 blockDigestEncoding = null; 314 } 315 } 316 317 /** 318 * Get the default payload digest encoding scheme. 319 * @return default payload digest encoding scheme 320 */ 321 public String getPayloadDigestEncoding() { 322 return payloadDigestEncoding; 323 } 324 325 /** 326 * Set the default payload digest encoding scheme. This scheme is only 327 * used if none can be inferred from an existing payload digest header. 328 * @param encodingScheme encoding scheme 329 * (null means default payload digest is not encoded) 330 */ 331 public void setPayloadDigestEncoding(String encodingScheme) { 332 if (encodingScheme != null && encodingScheme.length() > 0) { 333 payloadDigestEncoding = encodingScheme.toLowerCase(); 334 } else { 335 payloadDigestEncoding = null; 336 } 337 } 338 339 /** 340 * Get the max size allowed for a record header. 341 * @return max size allowed for a record header 342 */ 343 public int getRecordHeaderMaxSize() { 344 return recordHeaderMaxSize; 345 } 346 347 /** 348 * Set the max size allowed for a record header. 349 * @param size max size allowed 350 */ 351 public void setRecordHeaderMaxSize(int size) { 352 recordHeaderMaxSize = size; 353 } 354 355 /** 356 * Get the max size allowed for a payload header. 357 * @return max size allowed for a payload header 358 */ 359 public int getPayloadHeaderMaxSize() { 360 return payloadHeaderMaxSize; 361 } 362 363 /** 364 * Set the max size allowed for a payload header. 365 * @param size max size allowed 366 */ 367 public void setPayloadHeaderMaxSize(int size) { 368 payloadHeaderMaxSize = size; 369 } 370 371 /** 372 * Close current record resource(s) and input stream(s). 373 */ 374 public abstract void close(); 375 376 /** 377 * Callback method called when the payload has been processed. 378 */ 379 protected abstract void recordClosed(); 380 381 /** 382 * Get the offset of the current WARC record or -1 if none have been read. 383 * @return offset of the current WARC record or -1 384 */ 385 public abstract long getStartOffset(); 386 387 /** 388 * Get the current offset in the WARC <code>InputStream</code>. 389 * @return offset in WARC <code>InputStream</code> 390 */ 391 public abstract long getOffset(); 392 393 /** 394 * Get number of bytes consumed by this reader. 395 * @return number of bytes consumed by this reader 396 */ 397 public abstract long getConsumed(); 398 399 /** 400 * Parses and gets the next record. 401 * This method is for linear access to records. 402 * @return the next record 403 * @throws IOException i/o exception in parsing process 404 */ 405 public abstract WarcRecord getNextRecord() throws IOException; 406 407 /** 408 * Parses and gets the next record from an <code>Inputstream</code>. 409 * This method is mainly for random access use since there are serious 410 * side-effects involved in using multiple <code>PushBackInputStream</code> 411 * instances. 412 * @param in <code>InputStream</code> used to read next record 413 * @param offset offset provided by caller 414 * @return the next record 415 * @throws IOException i/o exception in parsing process 416 */ 417 public abstract WarcRecord getNextRecordFrom(InputStream in, long offset) 418 throws IOException; 419 420 /** 421 * Parses and gets the next record from an <code>Inputstream</code> wrapped 422 * by a <code>BufferedInputStream</code>. 423 * This method is mainly for random access use since there are serious 424 * side-effects involved in using multiple <code>PushBackInputStream</code> 425 * instances. 426 * @param in <code>InputStream</code> used to read next record 427 * @param offset offset provided by caller 428 * @param buffer_size buffer size to use 429 * @return the next record 430 * @throws IOException i/o exception in parsing process 431 */ 432 public abstract WarcRecord getNextRecordFrom(InputStream in, long offset, 433 int buffer_size) throws IOException; 434 435 /** 436 * Gets an exception thrown in the iterator if any or null. 437 * @return exception thrown in the iterator if any or null 438 */ 439 public Exception getIteratorExceptionThrown() { 440 return iteratorExceptionThrown; 441 } 442 443 /** 444 * Returns an <code>Iterator</code> over the records as they are being 445 * parsed. Any exception thrown during parsing is accessible through the 446 * <code>getIteratorExceptionThrown</code> method. 447 * @return <code>Iterator</code> over the records 448 */ 449 public Iterator<WarcRecord> iterator() { 450 return new Iterator<WarcRecord>() { 451 452 /** Internal next record updated by either hasNext() or next(). */ 453 private WarcRecord next; 454 455 /** Entry returned by next(). */ 456 private WarcRecord current; 457 458 @Override 459 public boolean hasNext() { 460 if (next == null) { 461 iteratorExceptionThrown = null; 462 try { 463 next = getNextRecord(); 464 } catch (IOException e) { 465 iteratorExceptionThrown = e; 466 } 467 } 468 return (next != null); 469 } 470 471 @Override 472 public WarcRecord next() { 473 if (next == null) { 474 iteratorExceptionThrown = null; 475 try { 476 next = getNextRecord(); 477 } catch (IOException e) { 478 iteratorExceptionThrown = e; 479 } 480 } 481 if (next == null) { 482 throw new NoSuchElementException(); 483 } 484 current = next; 485 next = null; 486 return current; 487 } 488 489 @Override 490 public void remove() { 491 throw new UnsupportedOperationException(); 492 } 493 }; 494 } 495 496}