/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import org.jwat.common.Base16;
import org.jwat.common.Base32;
import org.jwat.common.Base64;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.Diagnosis;
import org.jwat.common.DiagnosisType;
import org.jwat.common.Diagnostics;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.common.NewlineParser;
import org.jwat.common.Payload;
import org.jwat.common.PayloadOnClosedHandler;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * This class represents a parsed WARC record header block including
 * possible validation and format warnings/errors encountered in the process.
 * The payload of the WARC record is accessible through a wrapped payload
 * object.
 *
 * @author nicl
 */
public class WarcRecord implements PayloadOnClosedHandler, Closeable {

    /** Reader instance used, required for file compliance. */
    protected WarcReader reader;

    /** Input stream used to read this record. */
    protected ByteCountingPushBackInputStream in;

    /** Is this record compliant ie. error free. */
    protected boolean bIsCompliant;

    /** WARC record parsing start offset relative to the source WARC file input
     *  stream. Used to keep track of the uncompressed amount of bytes consumed. */
    protected long startOffset = -1;

    /** Uncompressed bytes consumed while validating this record. */
    protected long consumed;

    /** Validation errors and warnings. */
    public final Diagnostics<Diagnosis> diagnostics = new Diagnostics<Diagnosis>();

    /** Newline parser for counting/validating trailing newlines. */
    public NewlineParser nlp = new NewlineParser();

    /** Is Warc-Block-Digest valid. (Null is equal to not tested) */
    public Boolean isValidBlockDigest = null;

    /** Is Warc-Payload-Digest valid. (Null is equal to not tested) */
    public Boolean isValidPayloadDigest = null;

    /** Number of trailing newlines after record. */
    public int trailingNewlines;

    /*
     * Header-Fields.
     */

    /** WARC header. */
    public WarcHeader header;

    /*
     * Payload
     */

    /** Has payload been closed before. */
    protected boolean bPayloadClosed;

    /** Has record been closed before. */
    protected boolean bClosed;

    /** Payload object if any exists. */
    protected Payload payload;

    /** HTTP header content parsed from payload. */
    protected HttpHeader httpHeader;

    /** Computed block digest. */
    public WarcDigest computedBlockDigest;

    /** Computed payload digest. */
    public WarcDigest computedPayloadDigest;

    /**
     * Non public constructor to allow unit testing.
     */
    protected WarcRecord() {
    }

    /**
     * Create a <code>WarcRecord</code> and prepare it for writing.
     * The writer's field parsers are redirected to report diagnoses on the
     * new record.
     * @param writer writer which will be used to write the record
     * @return a <code>WarcRecord</code> ready to be changed and then written
     */
    public static WarcRecord createRecord(WarcWriter writer) {
        WarcRecord record = new WarcRecord();
        record.header = WarcHeader.initHeader(writer, record.diagnostics);
        writer.fieldParsers.diagnostics = record.diagnostics;
        return record;
    }

    /**
     * Given an <code>InputStream</code> it tries to read and validate a WARC
     * header block.
     * If a header is parsed and it declares a content length greater than
     * zero, a payload object is attached, and for
     * <code>application/http</code> content with a <code>msgtype</code> of
     * <code>request</code> or <code>response</code> an HTTP header is parsed
     * from the payload as well.
     * If no header can be parsed, the diagnoses collected so far are
     * transferred to the reader and <code>null</code> is returned.
     * @param in <code>InputStream</code> containing WARC record data
     * @param reader <code>WarcReader</code> used, with access to user defined
     * options
     * @return <code>WarcRecord</code> or <code>null</code>
     * @throws IOException i/o exception in the process of reading record
     */
    public static WarcRecord parseRecord(ByteCountingPushBackInputStream in,
                                    WarcReader reader) throws IOException {
        WarcRecord record = new WarcRecord();
        record.in = in;
        record.reader = reader;
        record.startOffset = in.getConsumed();
        // Initialize WarcHeader with required context.
        record.header = WarcHeader.initHeader(reader, in.getConsumed(), record.diagnostics);
        WarcHeader header = record.header;
        // Initialize WarcFieldParser to report diagnoses here.
        reader.fieldParsers.diagnostics = record.diagnostics;
        if (header.parseHeader(in)) {
            ++reader.records;
            /*
             * Payload processing.
             */
            if (header.contentLength != null && header.contentLength > 0) {
                /*
                 * Payload.
                 */
                String digestAlgorithm = selectDigestAlgorithm(reader.bBlockDigest,
                        header.warcBlockDigest, reader.blockDigestAlgorithm);
                record.payload = Payload.processPayload(in, header.contentLength,
                        reader.payloadHeaderMaxSize, digestAlgorithm);
                record.payload.setOnClosedHandler(record);
                /*
                 * HttpHeader.
                 */
                if (header.contentType != null
                        && header.contentType.contentType.equals("application")
                        && header.contentType.mediaType.equals("http")) {
                    String value = header.contentType.getParameter("msgtype");
                    // Stays 0 when msgtype is neither "response" nor
                    // "request", in which case no HTTP header is parsed.
                    int httpHeaderType = 0;
                    if ("response".equalsIgnoreCase(value)) {
                        httpHeaderType = HttpHeader.HT_RESPONSE;
                    } else if ("request".equalsIgnoreCase(value)) {
                        httpHeaderType = HttpHeader.HT_REQUEST;
                    }
                    if (httpHeaderType != 0) {
                        digestAlgorithm = selectDigestAlgorithm(reader.bPayloadDigest,
                                header.warcPayloadDigest, reader.payloadDigestAlgorithm);
                        // Try to read a valid HTTP request/response header from the payload.
                        record.httpHeader = HttpHeader.processPayload(httpHeaderType,
                                    record.payload.getInputStream(), header.contentLength,
                                    digestAlgorithm);
                        if (record.httpHeader != null) {
                            if (record.httpHeader.isValid()) {
                                record.payload.setPayloadHeaderWrapped(record.httpHeader);
                            } else {
                                record.diagnostics.addError(
                                        new Diagnosis(DiagnosisType.ERROR,
                                                "http header",
                                                "Unable to parse http header!"));
                            }
                        }
                    }
                }
            }
            // Preliminary compliance status, will be updated when the
            // payload/record is closed.
            record.bIsCompliant = !(record.diagnostics.hasErrors()
                    || record.diagnostics.hasWarnings());
            reader.bIsCompliant &= record.bIsCompliant;
        } else {
            // In case no record is found the errors/warnings in the record
            // object are transferred to the Reader.
            reader.diagnostics.addAll(record.diagnostics);
            if (record.diagnostics.hasErrors() || record.diagnostics.hasWarnings()) {
                reader.errors += record.diagnostics.getErrors().size();
                reader.warnings += record.diagnostics.getWarnings().size();
                reader.bIsCompliant = false;
            }
            // Require one or more records to be present.
            if (reader.records == 0) {
                reader.diagnostics.addError(new Diagnosis(DiagnosisType.ERROR_EXPECTED, "WARC file", "One or more records"));
                ++reader.errors;
                reader.bIsCompliant = false;
            }
            // EOF
            record = null;
        }
        return record;
    }

    /**
     * Select the algorithm to use when computing a digest while reading.
     * An algorithm named in the record's own digest header takes precedence
     * over the user specified default.
     * @param bEnabled whether computing this digest is enabled at all
     * @param headerDigest digest parsed from the WARC header, or null
     * @param defaultAlgorithm optional user specified algorithm, may be null
     * @return algorithm name, or null in which case nothing is computed
     */
    private static String selectDigestAlgorithm(boolean bEnabled,
                                    WarcDigest headerDigest, String defaultAlgorithm) {
        if (!bEnabled) {
            return null;
        }
        if (headerDigest != null && headerDigest.algorithm != null) {
            // A digest header is present in the record, use that algorithm.
            return headerDigest.algorithm;
        }
        // No usable digest header, fall back to the optional default.
        return defaultAlgorithm;
    }

    /**
     * Called when the payload object is closed and final steps in the
     * validation process can be performed: truncation check, block and
     * payload digest validation, trailing newline validation and the final
     * compliance/consumed bookkeeping. Only the first invocation has any
     * effect.
     * @throws IOException i/o exception in final validation processing
     */
    @Override
    public void payloadClosed() throws IOException {
        if (!bPayloadClosed) {
            if (payload != null) {
                // Check for truncated payload.
                if (payload.getUnavailable() > 0) {
                    // Payload length mismatch - Payload truncated
                    addErrorDiagnosis(DiagnosisType.INVALID_DATA, "Payload length mismatch", "Payload truncated");
                }
                /*
                 * Check block digest.
                 */
                byte[] digest = payload.getDigest();
                // Check for computed block digest.
                if (digest != null) {
                    computedBlockDigest = new WarcDigest();
                    computedBlockDigest.digestBytes = digest;
                }
                // Auto detect encoding used in WARC header.
                if (header.warcBlockDigest != null && header.warcBlockDigest.digestString != null) {
                    isValidBlockDigest = processWarcDigest(header.warcBlockDigest, computedBlockDigest, "block");
                }
                // Adjust information about computed block digest.
                if (computedBlockDigest != null) {
                    processComputedDigest(computedBlockDigest,
                            reader.blockDigestAlgorithm, reader.blockDigestEncoding, "block");
                }
                if (httpHeader != null && httpHeader.isValid()) {
                    /*
                     * Check payload digest.
                     */
                    digest = httpHeader.getDigest();
                    // Check for computed payload digest.
                    if (digest != null) {
                        computedPayloadDigest = new WarcDigest();
                        computedPayloadDigest.digestBytes = digest;
                    }
                    // Auto detect encoding used in WARC header.
                    if (header.warcPayloadDigest != null && header.warcPayloadDigest.digestString != null) {
                        isValidPayloadDigest = processWarcDigest(header.warcPayloadDigest, computedPayloadDigest, "payload");
                    }
                    // Adjust information about computed payload digest.
                    if (computedPayloadDigest != null) {
                        processComputedDigest(computedPayloadDigest,
                                reader.payloadDigestAlgorithm, reader.payloadDigestEncoding, "payload");
                    }
                }
            }
            // Check for trailing newlines.
            trailingNewlines = nlp.parseCRLFs(in, diagnostics);
            if (trailingNewlines != WarcConstants.WARC_RECORD_TRAILING_NEWLINES) {
                addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
                        "Trailing newlines",
                        Integer.toString(trailingNewlines),
                        Integer.toString(WarcConstants.WARC_RECORD_TRAILING_NEWLINES));
            }
            // isCompliant status update.
            if (diagnostics.hasErrors() || diagnostics.hasWarnings()) {
                bIsCompliant = false;
                reader.errors += diagnostics.getErrors().size();
                reader.warnings += diagnostics.getWarnings().size();
            } else {
                bIsCompliant = true;
            }
            reader.bIsCompliant &= bIsCompliant;
            // Updated consumed after payload has been consumed.
            consumed = in.getConsumed() - startOffset;
            // Make sure this final validation is not run again.
            bPayloadClosed = true;
            // Callback.
            reader.recordClosed();
        }
    }

    /**
     * Auto-detect encoding used in WARC digest header and compare it to the
     * internal one, if it has been computed.
     * The digest string is tried as base16, then base32, then base64; a
     * decoding is accepted only if its length matches the length reported
     * for the digest algorithm.
     * @param warcDigest digest from WARC header
     * @param computedDigest internally computed digest, or null
     * @param digestName used to identify the digest ("block" or "payload")
     * @return WARC digest validity indication, null if no computed digest
     * was available to compare against
     */
    protected Boolean processWarcDigest(WarcDigest warcDigest, WarcDigest computedDigest, String digestName) {
        byte[] digest;
        Boolean isValidDigest = null;
        int digestAlgorithmLength = WarcDigest.digestAlgorithmLength(warcDigest.algorithm);
        digest = Base16.decodeToArray(warcDigest.digestString);
        if (digest != null && digest.length == digestAlgorithmLength) {
            warcDigest.digestBytes = digest;
            warcDigest.encoding = "base16";
        }
        if (warcDigest.digestBytes == null) {
            digest = Base32.decodeToArray(warcDigest.digestString, true);
            if (digest != null && digest.length == digestAlgorithmLength) {
                warcDigest.digestBytes = digest;
                warcDigest.encoding = "base32";
            }
            if (warcDigest.digestBytes == null) {
                digest = Base64.decodeToArray(warcDigest.digestString, true);
                if (digest != null && digest.length == digestAlgorithmLength) {
                    warcDigest.digestBytes = digest;
                    warcDigest.encoding = "base64";
                }
            }
        }
        if (warcDigest.encoding == null) {
            // Encoding - Unrecognized digest encoding scheme
            addErrorDiagnosis(DiagnosisType.UNKNOWN,
                    "Record " + digestName + " digest encoding scheme",
                    warcDigest.digestString);
        }
        if (computedDigest != null) {
            // The computed digest inherits algorithm/encoding from the header.
            computedDigest.algorithm = warcDigest.algorithm;
            computedDigest.encoding = warcDigest.encoding;
            if (warcDigest.digestBytes != null) {
                if (!Arrays.equals(computedDigest.digestBytes, warcDigest.digestBytes)) {
                    // Digest - Computed digest does not match
                    addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
                            "Incorrect " + digestName + " digest",
                            Base16.encodeArray(warcDigest.digestBytes),
                            Base16.encodeArray(computedDigest.digestBytes));
                    isValidDigest = false;
                } else {
                    isValidDigest = true;
                }
            } else {
                // Header digest could not be decoded, so it can not be valid.
                isValidDigest = false;
            }
        }
        return isValidDigest;
    }

    /**
     * Adjust algorithm and encoding information about a computed digest and
     * fill in its string representation.
     * Defaults are only applied to values which have not already been set.
     * @param computedDigest internally computed digest
     * @param digestAlgorithm default algorithm
     * @param digestEncoding default encoding
     * @param digestName used to identify the digest ("block" or "payload")
     */
    protected void processComputedDigest(WarcDigest computedDigest, String digestAlgorithm, String digestEncoding, String digestName) {
        if (computedDigest.algorithm == null) {
            computedDigest.algorithm = digestAlgorithm;
        }
        if (computedDigest.encoding == null && digestEncoding != null) {
            if ("base32".equals(digestEncoding)) {
                computedDigest.encoding = "base32";
            } else if ("base64".equals(digestEncoding)) {
                computedDigest.encoding = "base64";
            } else if ("base16".equals(digestEncoding)) {
                computedDigest.encoding = "base16";
            } else {
                // Encoding - Unknown default digest encoding scheme ..
                addErrorDiagnosis(DiagnosisType.UNKNOWN,
                        "Default " + digestName + " digest encoding scheme",
                        digestEncoding);
            }
        }
        if (computedDigest.encoding != null) {
            if ("base32".equals(computedDigest.encoding)) {
                computedDigest.digestString = Base32.encodeArray(computedDigest.digestBytes);
            } else if ("base64".equals(computedDigest.encoding)) {
                computedDigest.digestString = Base64.encodeArray(computedDigest.digestBytes);
            } else if ("base16".equals(computedDigest.encoding)) {
                computedDigest.digestString = Base16.encodeArray(computedDigest.digestBytes);
            }
        }
    }

    /**
     * Check to see if the record has been closed.
     * @return boolean indicating whether this record is closed or not
     */
    public boolean isClosed() {
        return bClosed;
    }

    /**
     * Close resources associated with the WARC record.
     * Mainly payload stream if any. The final validation in
     * {@link #payloadClosed()} is triggered and the references to the reader
     * and input stream are released. Calling this method on an already
     * closed record does nothing.
     * @throws IOException if unable to close resources
     */
    @Override
    public void close() throws IOException {
        if (!bClosed) {
            // Ensure input stream is at the end of the record payload.
            if (payload != null) {
                payload.close();
            }
            payloadClosed();
            reader = null;
            in = null;
            bClosed = true;
        }
    }

    /**
     * Returns a boolean indicating the ISO compliance status of this record.
     * @return a boolean indicating the ISO compliance status of this record
     */
    public boolean isCompliant() {
        return bIsCompliant;
    }

    /**
     * Get the record offset relative to the start of the WARC file
     * <code>InputStream</code>, as stored in the header.
     * @return the record offset relative to the start of the WARC file
     */
    public long getStartOffset() {
        return header.startOffset;
    }

    /**
     * Return number of uncompressed bytes consumed validating this record.
     * @return number of uncompressed bytes consumed validating this record
     */
    public long getConsumed() {
        return consumed;
    }

    /**
     * Get a <code>List</code> of all the non-standard WARC headers found
     * during parsing. The returned list is an unmodifiable view.
     * @return <code>List</code> of <code>HeaderLine</code>
     */
    public List<HeaderLine> getHeaderList() {
        return Collections.unmodifiableList(header.headerList);
    }

    /**
     * Get a non-standard WARC header or null, if nothing is stored for this
     * header name. The name is lower-cased before the lookup.
     * @param field header name
     * @return <code>HeaderLine</code> structure or null
     */
    public HeaderLine getHeader(String field) {
        if (field != null && field.length() > 0) {
            return header.headerMap.get(field.toLowerCase());
        } else {
            return null;
        }
    }

    /**
     * Specifies whether this record has a payload or not.
     * @return true/false whether the WARC record has a payload
     */
    public boolean hasPayload() {
        return (payload != null);
    }

    /**
     * Return Payload object.
     * @return payload or <code>null</code>
     */
    public Payload getPayload() {
        return payload;
    }

    /**
     * Payload content <code>InputStream</code> getter.
     * @return Payload content <code>InputStream</code> or <code>null</code>
     * if the record has no payload
     */
    public InputStream getPayloadContent() {
        return (payload != null) ? payload.getInputStream() : null;
    }

    /**
     * Returns the <code>HttpHeader</code> object if identified in the payload,
     * or null.
     * @return the <code>HttpHeader</code> object if identified or null
     */
    public HttpHeader getHttpHeader() {
        return httpHeader;
    }

    /**
     * Add an error diagnosis of the given type on a specific entity with
     * optional extra information. The information varies according to the
     * diagnosis type.
     * @param type diagnosis type
     * @param entity entity examined
     * @param information optional extra information
     */
    protected void addErrorDiagnosis(DiagnosisType type, String entity, String... information) {
        diagnostics.addError(new Diagnosis(type, entity, information));
    }

}