001/** 002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC 003 * and GZip files. (http://jwat.org/) 004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/) 005 * 006 * Licensed under the Apache License, Version 2.0 (the "License"); 007 * you may not use this file except in compliance with the License. 008 * You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.jwat.warc; 019 020import org.jwat.common.ByteCountingPushBackInputStream; 021import org.jwat.common.ContentType; 022import org.jwat.common.Diagnosis; 023import org.jwat.common.DiagnosisType; 024import org.jwat.common.Diagnostics; 025import org.jwat.common.HeaderLine; 026import org.jwat.common.MaxLengthRecordingInputStream; 027import org.jwat.common.Uri; 028import org.jwat.common.UriProfile; 029 030import java.io.ByteArrayOutputStream; 031import java.io.IOException; 032import java.net.InetAddress; 033import java.text.DateFormat; 034import java.util.Collections; 035import java.util.Date; 036import java.util.HashMap; 037import java.util.LinkedList; 038import java.util.List; 039import java.util.Map; 040 041/** 042 * Central class for working with WARC headers. This class includes support for 043 * reading and writing WARC headers. Methods are also available to validate 044 * individual headers and a WARC header as a whole. 045 * 046 * @author nicl 047 */ 048public class WarcHeader { 049 050 /** An URI with encapsulating <> characters. */ 051 public static final boolean URI_LTGT = true; 052 053 /** An URI without encapsulating <> characters. */ 054 public static final boolean URI_NAKED = false; 055 056 /** Associated WarcReader context. 057 * Must be set prior to calling the various methods. */ 058 protected WarcReader reader; 059 060 /** Diagnostics used to report diagnoses. 061 * Must be set prior to calling the various methods. */ 062 protected Diagnostics<Diagnosis> diagnostics; 063 064 /** WARC-Target-URI profile. */ 065 protected UriProfile warcTargetUriProfile; 066 067 /** URI profile. */ 068 protected UriProfile uriProfile; 069 070 /** WARC field parser used. 071 * Must be set prior to calling the various methods. */ 072 protected WarcFieldParsers fieldParsers; 073 074 /** WARC <code>DateFormat</code> as specified by the WARC ISO standard. */ 075 protected DateFormat warcDateFormat; 076 077 /** WARC record starting offset relative to the source WARC file input 078 * stream. The offset is correct for both compressed and uncompressed streams. */ 079 protected long startOffset = -1; 080 081 /* 082 * Version related fields. 083 */ 084 085 /** Was "WARC/" identified while looking for the version string. */ 086 public boolean bMagicIdentified; 087 /** Did the version string include between 2 and 4 substrings delimited by ".". */ 088 public boolean bVersionParsed; 089 /** Is the version format valid. */ 090 public boolean bValidVersionFormat; 091 /** Is the version recognized. (0.17, 0.18 or 1.0) */ 092 public boolean bValidVersion; 093 094 /** Raw version string. */ 095 public String versionStr; 096 /** Array based on the version string split by the "." delimiter and converted to integers. */ 097 public int[] versionArr; 098 099 /** Major version number from WARC header. */ 100 public int major = -1; 101 /** Minor version number from WARC header. */ 102 public int minor = -1; 103 104 /* 105 * WARC header fields. 106 */ 107 108 /** Array used for duplicate header detection. */ 109 protected boolean[] seen = new boolean[WarcConstants.FN_INDEX_OF_LAST]; 110 111 /** Is the header missing one of the mandatory headers. */ 112 public boolean bMandatoryMissing; 113 114 /** WARC-Type field string value. */ 115 public String warcTypeStr; 116 /** WARC-Type converted to an integer id, if identified. */ 117 public Integer warcTypeIdx; 118 119 /** WARC-Filename field string value. 120 * (warcinfo record type only) */ 121 public String warcFilename; 122 123 /** WARC-Record-Id field string value. */ 124 public String warcRecordIdStr; 125 /** WARC-Record-Id converted to an <code>Uri</code> object, if valid. */ 126 public Uri warcRecordIdUri; 127 128 /** WARC-Date field string value. */ 129 public String warcDateStr; 130 /** WARC-Date converted to a <code>Date</code> object, if valid. */ 131 public Date warcDate; 132 133 /** Content-Length field string value. */ 134 public String contentLengthStr; 135 /** Content-Length converted to a <code>Long</code> object, if valid. */ 136 public Long contentLength; 137 138 /** Content-Type field string value. */ 139 public String contentTypeStr; 140 /** Content-Type converted to a <code>ContentType</code> object, if valid. */ 141 public ContentType contentType; 142 143 /** WARC-Truncated field string value. */ 144 public String warcTruncatedStr; 145 /** WARC-Truncated converted to an integer id, if valid. */ 146 public Integer warcTruncatedIdx; 147 148 /** WARC-IP-Address field string value. */ 149 public String warcIpAddress; 150 /** WARC-IP-Address converted to an <code>InetAddress</code> object, if valid. */ 151 public InetAddress warcInetAddress; 152 153 /** List of WARC-Concurrent-To field string values and converted <code>URI</code> objects, if valid. */ 154 public List<WarcConcurrentTo> warcConcurrentToList = new LinkedList<WarcConcurrentTo>(); 155 156 /** WARC-Refers-To field string value. */ 157 public String warcRefersToStr; 158 /** WARC-Refers-To converted to an <code>Uri</code> object, if valid. */ 159 public Uri warcRefersToUri; 160 161 /** WARC_Target-URI field string value. */ 162 public String warcTargetUriStr; 163 /** WARC-TargetURI converted to an <code>Uri</code> object, if valid. */ 164 public Uri warcTargetUriUri; 165 166 /** WARC-Warcinfo-Id field string value. */ 167 public String warcWarcinfoIdStr; 168 /** WARC-Warcinfo-Id converted to an <code>Uri</code> object, if valid. */ 169 public Uri warcWarcinfoIdUri; 170 171 /** WARC-Block-Digest field string value. */ 172 public String warcBlockDigestStr; 173 /** WARC-Block-Digest converted to a <code>WarcDigest</code> object, if valid. */ 174 public WarcDigest warcBlockDigest; 175 176 /** WARC-Payload-Digest field string value. */ 177 public String warcPayloadDigestStr; 178 /** WARC-Payload-Digest converted to a <code>WarcDigest</code> object, if valid. */ 179 public WarcDigest warcPayloadDigest; 180 181 /** WARC-Identified-Payload-Type field string value. */ 182 public String warcIdentifiedPayloadTypeStr; 183 /** WARC-Identified-Payload-Type converted to a <code>ContentType</code> object, if valid. */ 184 public ContentType warcIdentifiedPayloadType; 185 186 /** WARC-Profile field string value. 187 * (revisit record only) */ 188 public String warcProfileStr; 189 /** WARC-Profile field converted to an <code>Uri</code> object, if valid. 190 * (revisit record only) */ 191 public Uri warcProfileUri; 192 /** WARC-Profile converted to an integer id, if valid. 193 * (revisit record only) */ 194 public Integer warcProfileIdx; 195 196 /** WARC-Segment-Number field string value. */ 197 public String warcSegmentNumberStr; 198 /** WARC-Segment-Number converted to an <code>Integer</code> object, if valid. */ 199 public Integer warcSegmentNumber; 200 201 /** WARC-Segment-Origin-Id field string value. 202 * (continuation record only) */ 203 public String warcSegmentOriginIdStr; 204 /** WARC-Segment-Origin-Id converted to an <code>Uri</code> object, if valid. 205 * (continuation record only) */ 206 public Uri warcSegmentOriginIdUrl; 207 208 /** WARC-Segment-Total-Length field string value. 209 * (continuation record only) */ 210 public String warcSegmentTotalLengthStr; 211 /** WARC-Segment-Total-Length converted to a <code>Long</code> object, if valid. 212 * (continuation record only) */ 213 public Long warcSegmentTotalLength; 214 215 // see https://docs.google.com/document/d/1QyQBA7Ykgxie75V8Jziz_O7hbhwf7PF6_u9O6w6zgp0/edit 216 /** WARC-Refers-To-Target-URI field string value. */ 217 public String warcRefersToTargetUriStr; 218 /** WARC-Refers-To-Target-URI converted to an <code>Uri</code> object, if valid. */ 219 public Uri warcRefersToTargetUriUri; 220 /** WARC-Refers-To-Date */ 221 public String warcRefersToDateStr; 222 /** WARC-Date converted to a <code>Date</code> object, if valid. */ 223 public Date warcRefersToDate; 224 225 /* 226 * WARC header fields collections. 227 */ 228 229 /** Raw WARC header output stream. */ 230 protected ByteArrayOutputStream headerBytesOut = new ByteArrayOutputStream(); 231 232 /** Raw WARC header byte array. */ 233 public byte[] headerBytes; 234 235 /** List of parsed header fields. */ 236 protected List<HeaderLine> headerList = new LinkedList<HeaderLine>(); 237 238 /** Map of parsed header fields. */ 239 protected Map<String, HeaderLine> headerMap = new HashMap<String, HeaderLine>(); 240 241 /** 242 * Non public constructor to allow unit testing. 243 */ 244 protected WarcHeader() { 245 } 246 247 /** 248 * Create and initialize a new <code>WarcHeader</code> for writing. 249 * @param writer writer which shall be used 250 * @param diagnostics diagnostics object used by writer 251 * @return a <code>WarcHeader</code> prepared for writing 252 */ 253 public static WarcHeader initHeader(WarcWriter writer, Diagnostics<Diagnosis> diagnostics) { 254 WarcHeader header = new WarcHeader(); 255 // Set default version to "1.0". 256 header.major = 1; 257 header.minor = 0; 258 header.warcTargetUriProfile = writer.warcTargetUriProfile; 259 header.uriProfile = writer.uriProfile; 260 header.fieldParsers = writer.fieldParsers; 261 header.warcDateFormat = writer.warcDateFormat; 262 header.diagnostics = diagnostics; 263 return header; 264 } 265 266 /** 267 * Create and initialize a new <code>WarcHeader</code> for reading. 268 * @param reader reader which shall be used 269 * @param startOffset start offset of header 270 * @param diagnostics diagnostics object used by reader 271 * @return a <code>WarcHeader</code> prepared for reading 272 */ 273 public static WarcHeader initHeader(WarcReader reader, long startOffset, Diagnostics<Diagnosis> diagnostics) { 274 WarcHeader header = new WarcHeader(); 275 header.reader = reader; 276 header.warcTargetUriProfile = reader.warcTargetUriProfile; 277 header.uriProfile = reader.uriProfile; 278 header.fieldParsers = reader.fieldParsers; 279 header.diagnostics = diagnostics; 280 // This is only relevant for uncompressed sequentially read records 281 header.startOffset = startOffset; 282 return header; 283 } 284 285 /** 286 * Add an error diagnosis of the given type on a specific entity with 287 * optional extra information. The information varies according to the 288 * diagnosis type. 289 * @param type diagnosis type 290 * @param entity entity examined 291 * @param information optional extra information 292 */ 293 protected void addErrorDiagnosis(DiagnosisType type, String entity, String... information) { 294 diagnostics.addError(new Diagnosis(type, entity, information)); 295 } 296 297 /** 298 * Add a warning diagnosis of the given type on a specific entity with 299 * optional extra information. The information varies according to the 300 * diagnosis type. 301 * @param type diagnosis type 302 * @param entity entity examined 303 * @param information optional extra information 304 */ 305 protected void addWarningDiagnosis(DiagnosisType type, String entity, String... information) { 306 diagnostics.addWarning(new Diagnosis(type, entity, information)); 307 } 308 309 /** 310 * Returns the starting offset of the record in the containing WARC. 311 * @return the starting offset of the record 312 */ 313 public long getStartOffset() { 314 return startOffset; 315 } 316 317 /** 318 * Try to parse a WARC header and return a boolean indicating the success or 319 * failure of this. 320 * @param in input stream with WARC data 321 * @return boolean indicating whether a header was parsed or not 322 * @throws IOException if an i/o exception occurs while parsing for a header 323 */ 324 public boolean parseHeader(ByteCountingPushBackInputStream in) throws IOException { 325 if (parseVersion(in)) { 326 // debug 327 //System.out.println(wr.bMagicIdentified); 328 //System.out.println(wr.bVersionParsed); 329 //System.out.println(wr.major + "." + wr.minor); 330 if (bVersionParsed && versionArr.length == 2) { 331 switch (major) { 332 case 1: 333 if (minor == 0) { 334 bValidVersion = true; 335 } 336 break; 337 case 0: 338 switch (minor) { 339 case 17: 340 case 18: 341 bValidVersion = true; 342 break; 343 } 344 break; 345 default: 346 break; 347 } 348 if (!bValidVersion) { 349 diagnostics.addError( 350 new Diagnosis(DiagnosisType.UNKNOWN, 351 "Magic version number", versionStr)); 352 } 353 } else { 354 diagnostics.addError( 355 new Diagnosis(DiagnosisType.INVALID_DATA, 356 "Magic Version string", versionStr)); 357 } 358 359 MaxLengthRecordingInputStream mrin = new MaxLengthRecordingInputStream(in, reader.recordHeaderMaxSize); 360 ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(mrin, reader.recordHeaderMaxSize); 361 362 parseHeaders(pbin); 363 pbin.close(); 364 365 checkFields(); 366 367 headerBytes = headerBytesOut.toByteArray(); 368 } 369 return bMagicIdentified; 370 } 371 372 /** 373 * Looks forward in the input stream for a valid WARC version line. 374 * @param in data input stream 375 * @return true, if magic WARC header found 376 * @throws IOException if an error occurs while reading version data 377 */ 378 protected boolean parseVersion(ByteCountingPushBackInputStream in) throws IOException { 379 bMagicIdentified = false; 380 bVersionParsed = false; 381 boolean bInvalidDataBeforeVersion = false; 382 boolean bEmptyLinesBeforeVersion = false; 383 HeaderLine line; 384 String tmpStr; 385 boolean bSeekMagic = true; 386 // Loop until when have found something that looks like a version line. 387 while (bSeekMagic) { 388 // This is only relevant for uncompressed sequentially read records 389 startOffset = in.getConsumed(); 390 line = reader.lineReader.readLine(in); 391 if (!reader.lineReader.bEof) { 392 switch (line.type) { 393 case HeaderLine.HLT_LINE: 394 tmpStr = line.line; 395 // debug 396 //System.out.println(tmpStr); 397 if (tmpStr.length() > 0) { 398 if (tmpStr.toUpperCase().startsWith(WarcConstants.WARC_MAGIC_HEADER)) { 399 bMagicIdentified = true; 400 versionStr = tmpStr.substring(WarcConstants.WARC_MAGIC_HEADER.length()); 401 String[] tmpArr = versionStr.split("\\.", -1); // Not optimal 402 if (tmpArr.length >= 2 && tmpArr.length <= 4) { 403 bVersionParsed = true; 404 bValidVersionFormat = true; 405 versionArr = new int[tmpArr.length]; 406 for (int i=0; i<tmpArr.length; ++i) { 407 try { 408 versionArr[i] = Integer.parseInt(tmpArr[i]); 409 } catch (NumberFormatException e) { 410 versionArr[i] = -1; 411 bValidVersionFormat = false; 412 } 413 } 414 major = versionArr[0]; 415 minor = versionArr[1]; 416 } 417 headerBytesOut.write(line.raw); 418 bSeekMagic = false; 419 } else { 420 // Invalid data aka Gibberish. 421 bInvalidDataBeforeVersion = true; 422 } 423 } else { 424 // Empty line. 425 bEmptyLinesBeforeVersion = true; 426 427 } 428 break; 429 case HeaderLine.HLT_HEADERLINE: 430 // Invalid data - header or binary. 431 bInvalidDataBeforeVersion = true; 432 break; 433 } 434 } else { 435 // EOF. 436 bSeekMagic = false; 437 } 438 } 439 if (bInvalidDataBeforeVersion) { 440 addErrorDiagnosis(DiagnosisType.INVALID, "Data before WARC version"); 441 } 442 if (bEmptyLinesBeforeVersion) { 443 addErrorDiagnosis(DiagnosisType.INVALID, "Empty lines before WARC version"); 444 } 445 return bMagicIdentified; 446 } 447 448 /** 449 * Reads WARC header lines one line at a time until an empty line is 450 * encountered. 451 * @param in header input stream 452 * @throws IOException if an error occurs while reading the WARC header 453 */ 454 protected void parseHeaders(ByteCountingPushBackInputStream in) throws IOException { 455 HeaderLine headerLine; 456 boolean bLoop = true; 457 while (bLoop) { 458 headerLine = reader.headerLineReader.readLine(in); 459 if (!reader.headerLineReader.bEof) { 460 headerBytesOut.write(headerLine.raw); 461 switch (headerLine.type) { 462 case HeaderLine.HLT_HEADERLINE: 463 if (headerLine.name != null && headerLine.name.length() > 0) { 464 // debug 465 //System.out.println(headerLine.name); 466 //System.out.println(headerLine.value); 467 addHeader(headerLine); 468 } else { 469 // Empty field name. 470 addWarningDiagnosis(DiagnosisType.EMPTY, "Header line"); 471 } 472 break; 473 case HeaderLine.HLT_LINE: 474 if (headerLine.line.length() == 0) { 475 // Empty line. 476 bLoop = false; 477 } else { 478 // Unknown header line. 479 addWarningDiagnosis(DiagnosisType.UNKNOWN, "Header line", headerLine.line); 480 } 481 break; 482 default: 483 throw new IllegalStateException("Invalid HeaderLine output!"); 484 } 485 } else { 486 // EOF. 487 bLoop = false; 488 } 489 } 490 } 491 492 /** 493 * Identify a (WARC) header name, validate the value and set the header. 494 * @param headerLine the headerLine 495 */ 496 protected void addHeader(HeaderLine headerLine) { 497 String fieldName = headerLine.name; 498 String fieldValue = headerLine.value; 499 WarcConcurrentTo warcConcurrentTo; 500 Integer fn_idx = WarcConstants.fieldNameIdxMap.get(fieldName.toLowerCase()); 501 if (fn_idx != null) { 502 // WARC field name defined in WARC specification. 503 if (!seen[fn_idx] || WarcConstants.fieldNamesRepeatableLookup[fn_idx]) { 504 seen[fn_idx] = true; 505 switch (fn_idx.intValue()) { 506 case WarcConstants.FN_IDX_WARC_TYPE: 507 warcTypeStr = fieldParsers.parseString(fieldValue, 508 WarcConstants.FN_WARC_TYPE); 509 if (warcTypeStr != null) { 510 warcTypeIdx = WarcConstants.recordTypeIdxMap.get(warcTypeStr.toLowerCase()); 511 } 512 if (warcTypeIdx == null && warcTypeStr != null && warcTypeStr.length() > 0) { 513 warcTypeIdx = WarcConstants.RT_IDX_UNKNOWN; 514 } 515 break; 516 case WarcConstants.FN_IDX_WARC_RECORD_ID: 517 warcRecordIdStr = fieldValue; 518 warcRecordIdUri = fieldParsers.parseUri(fieldValue, URI_LTGT, 519 uriProfile, WarcConstants.FN_WARC_RECORD_ID); 520 break; 521 case WarcConstants.FN_IDX_WARC_DATE: 522 warcDateStr = fieldValue; 523 warcDate = fieldParsers.parseDate(fieldValue, 524 WarcConstants.FN_WARC_DATE); 525 break; 526 case WarcConstants.FN_IDX_CONTENT_LENGTH: 527 contentLengthStr = fieldValue; 528 contentLength = fieldParsers.parseLong(fieldValue, 529 WarcConstants.FN_CONTENT_LENGTH); 530 break; 531 case WarcConstants.FN_IDX_CONTENT_TYPE: 532 contentTypeStr = fieldValue; 533 contentType = fieldParsers.parseContentType(fieldValue, 534 WarcConstants.FN_CONTENT_TYPE); 535 break; 536 case WarcConstants.FN_IDX_WARC_CONCURRENT_TO: 537 Uri tmpUri = fieldParsers.parseUri(fieldValue, URI_LTGT, 538 uriProfile, WarcConstants.FN_WARC_CONCURRENT_TO); 539 if (fieldValue != null && fieldValue.trim().length() > 0) { 540 warcConcurrentTo = new WarcConcurrentTo(); 541 warcConcurrentTo.warcConcurrentToStr = fieldValue; 542 warcConcurrentTo.warcConcurrentToUri = tmpUri; 543 warcConcurrentToList.add(warcConcurrentTo); 544 } 545 break; 546 case WarcConstants.FN_IDX_WARC_BLOCK_DIGEST: 547 warcBlockDigestStr = fieldValue; 548 warcBlockDigest = fieldParsers.parseDigest(fieldValue, 549 WarcConstants.FN_WARC_BLOCK_DIGEST); 550 break; 551 case WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST: 552 warcPayloadDigestStr = fieldValue; 553 warcPayloadDigest = fieldParsers.parseDigest(fieldValue, 554 WarcConstants.FN_WARC_PAYLOAD_DIGEST); 555 break; 556 case WarcConstants.FN_IDX_WARC_IP_ADDRESS: 557 warcIpAddress = fieldValue; 558 warcInetAddress = fieldParsers.parseIpAddress(fieldValue, 559 WarcConstants.FN_WARC_IP_ADDRESS); 560 break; 561 case WarcConstants.FN_IDX_WARC_REFERS_TO: 562 warcRefersToStr = fieldValue; 563 warcRefersToUri = fieldParsers.parseUri(fieldValue, URI_LTGT, 564 uriProfile, WarcConstants.FN_WARC_REFERS_TO); 565 break; 566 case WarcConstants.FN_IDX_WARC_TARGET_URI: 567 warcTargetUriStr = fieldValue; 568 warcTargetUriUri = fieldParsers.parseUri(fieldValue, URI_NAKED, 569 warcTargetUriProfile, WarcConstants.FN_WARC_TARGET_URI); 570 break; 571 case WarcConstants.FN_IDX_WARC_TRUNCATED: 572 warcTruncatedStr = fieldParsers.parseString(fieldValue, 573 WarcConstants.FN_WARC_TRUNCATED); 574 if (warcTruncatedStr != null) { 575 warcTruncatedIdx = WarcConstants.truncatedTypeIdxMap.get(warcTruncatedStr.toLowerCase()); 576 } 577 if (warcTruncatedIdx == null && warcTruncatedStr != null && warcTruncatedStr.length() > 0) { 578 warcTruncatedIdx = WarcConstants.TT_IDX_FUTURE_REASON; 579 } 580 break; 581 case WarcConstants.FN_IDX_WARC_WARCINFO_ID: 582 warcWarcinfoIdStr = fieldValue; 583 warcWarcinfoIdUri = fieldParsers.parseUri(fieldValue, URI_LTGT, 584 uriProfile, WarcConstants.FN_WARC_WARCINFO_ID); 585 break; 586 case WarcConstants.FN_IDX_WARC_FILENAME: 587 warcFilename = fieldParsers.parseString(fieldValue, 588 WarcConstants.FN_WARC_FILENAME); 589 break; 590 case WarcConstants.FN_IDX_WARC_PROFILE: 591 warcProfileStr = fieldValue; 592 warcProfileUri = fieldParsers.parseUri(fieldValue, URI_NAKED, 593 uriProfile, WarcConstants.FN_WARC_PROFILE); 594 if (warcProfileStr != null) { 595 warcProfileIdx = WarcConstants.profileIdxMap.get(warcProfileStr.toLowerCase()); 596 } 597 if (warcProfileIdx == null && warcProfileStr != null && warcProfileStr.length() > 0) { 598 warcProfileIdx = WarcConstants.PROFILE_IDX_UNKNOWN; 599 } 600 break; 601 case WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE: 602 warcIdentifiedPayloadTypeStr = fieldValue; 603 warcIdentifiedPayloadType = fieldParsers.parseContentType(fieldValue, 604 WarcConstants.FN_WARC_IDENTIFIED_PAYLOAD_TYPE); 605 break; 606 case WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID: 607 warcSegmentOriginIdStr = fieldValue; 608 warcSegmentOriginIdUrl = fieldParsers.parseUri(fieldValue, URI_LTGT, 609 uriProfile, WarcConstants.FN_WARC_SEGMENT_ORIGIN_ID); 610 break; 611 case WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER: 612 warcSegmentNumberStr = fieldValue; 613 warcSegmentNumber = fieldParsers.parseInteger(fieldValue, 614 WarcConstants.FN_WARC_SEGMENT_NUMBER); 615 break; 616 case WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH: 617 warcSegmentTotalLengthStr = fieldValue; 618 warcSegmentTotalLength = fieldParsers.parseLong(fieldValue, 619 WarcConstants.FN_WARC_SEGMENT_TOTAL_LENGTH); 620 break; 621 case WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI: 622 warcRefersToTargetUriStr = fieldValue; 623 warcRefersToTargetUriUri = fieldParsers.parseUri(fieldValue, URI_NAKED, 624 uriProfile, WarcConstants.FN_WARC_REFERS_TO_TARGET_URI); 625 break; 626 case WarcConstants.FN_IDX_WARC_REFERS_TO_DATE: 627 warcRefersToDateStr = fieldValue; 628 warcRefersToDate = fieldParsers.parseDate(fieldValue, 629 WarcConstants.FN_WARC_REFERS_TO_DATE); 630 break; 631 } 632 } else { 633 // Duplicate field. 634 addErrorDiagnosis(DiagnosisType.DUPLICATE, "'" + fieldName + "' header", fieldValue); 635 } 636 } 637 HeaderLine tmpLine = headerMap.get(fieldName.toLowerCase()); 638 if (tmpLine == null) { 639 headerMap.put(fieldName.toLowerCase(), headerLine); 640 } else { 641 tmpLine.lines.add(headerLine); 642 } 643 headerList.add(headerLine); 644 } 645 646 /** 647 * Get a <code>List</code> of all the headers found during parsing. 648 * @return <code>List</code> of <code>HeaderLine</code> 649 */ 650 public List<HeaderLine> getHeaderList() { 651 return Collections.unmodifiableList(headerList); 652 } 653 654 /** 655 * Get a header line structure or null, if no header line structure is 656 * stored with the given header name. 657 * @param field header name 658 * @return <code>HeaderLine</code> structure or null 659 */ 660 public HeaderLine getHeader(String field) { 661 if (field != null && field.length() > 0) { 662 return headerMap.get(field.toLowerCase()); 663 } else { 664 return null; 665 } 666 } 667 668 /** 669 * Add a String header using the supplied string and return a 670 * <code>HeaderLine</code> object corresponding to how the header would be 671 * read. 672 * @param fieldName name of field to add 673 * @param fieldValue field value string 674 * @return <code>HeaderLine</code> object corresponding to what would have been read 675 */ 676 public HeaderLine addHeader(String fieldName, String fieldValue) { 677 HeaderLine headerLine = new HeaderLine(); 678 headerLine.name = fieldName; 679 headerLine.value = fieldValue; 680 addHeader(headerLine); 681 return headerLine; 682 } 683 684 /** 685 * Add an Integer header using the supplied string and object values and return 686 * a <code>HeaderLine</code> object corresponding to how the header would be read. 687 * If both string and object values are not null they are used as is. 688 * If the string value is null and the object is not null, 689 * the object's toString method is called. 690 * If the object is null and the string is not null, the string is parsed 691 * and validated resulting in an object, if valid. 692 * @param fieldName name of field to add 693 * @param integerFieldValue <code>Integer</code> field value object 694 * @param fieldValueStr Integer field value string 695 * @return <code>HeaderLine</code> object corresponding to what would have been read 696 */ 697 public HeaderLine addHeader(String fieldName, Integer integerFieldValue, String fieldValueStr) { 698 if (integerFieldValue == null && fieldValueStr != null) { 699 integerFieldValue = fieldParsers.parseInteger(fieldValueStr, fieldName); 700 } else if (fieldValueStr == null && integerFieldValue != null) { 701 fieldValueStr = integerFieldValue.toString(); 702 } 703 return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_INTEGER, 704 integerFieldValue, null, null, null, null, null, null); 705 } 706 707 /** 708 * Add a Long header using the supplied string and object values and return 709 * a <code>HeaderLine</code> object corresponding to how the header would be read. 710 * If both string and object values are not null they are used as is. 711 * If the string value is null and the object is not null, 712 * the object's toString method is called. 713 * If the object is null and the string is not null, the string is parsed 714 * and validated resulting in an object, if valid. 715 * @param fieldName name of field to add 716 * @param longFieldValue <code>Long</code> field value object 717 * @param fieldValueStr Long field value string 718 * @return <code>HeaderLine</code> object corresponding to what would have been read 719 */ 720 public HeaderLine addHeader(String fieldName, Long longFieldValue, String fieldValueStr) { 721 if (longFieldValue == null && fieldValueStr != null) { 722 longFieldValue = fieldParsers.parseLong(fieldValueStr, fieldName); 723 } else if (fieldValueStr == null && longFieldValue != null) { 724 fieldValueStr = longFieldValue.toString(); 725 } 726 return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_LONG, 727 null, longFieldValue, null, null, null, null, null); 728 } 729 730 /** 731 * Add an Digest header using the supplied string and object values and return 732 * a <code>HeaderLine</code> object corresponding to how the header would be read. 733 * If both string and object values are not null they are used as is. 734 * If the string value is null and the object is not null, 735 * the object's toString method is called. 736 * If the object is null and the string is not null, the string is parsed 737 * and validated resulting in an object, if valid. 738 * @param fieldName name of field to add 739 * @param digestFieldValue <code>Digest</code> field value object 740 * @param fieldValueStr Digest field value string 741 * @return <code>HeaderLine</code> object corresponding to what would have been read 742 */ 743 public HeaderLine addHeader(String fieldName, WarcDigest digestFieldValue, String fieldValueStr) { 744 if (digestFieldValue == null && fieldValueStr != null) { 745 digestFieldValue = fieldParsers.parseDigest(fieldValueStr, fieldName); 746 } else if (fieldValueStr == null && digestFieldValue != null) { 747 fieldValueStr = digestFieldValue.toString(); 748 } 749 return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_DIGEST, 750 null, null, digestFieldValue, null, null, null, null); 751 } 752 753 /** 754 * Add an Content-Type header using the supplied string and object values and return 755 * a <code>HeaderLine</code> object corresponding to how the header would be read. 756 * If both string and object values are not null they are used as is. 757 * If the string value is null and the object is not null, 758 * the object's toString method is called. 759 * If the object is null and the string is not null, the string is parsed 760 * and validated resulting in an object, if valid. 761 * @param fieldName name of field to add 762 * @param contentTypeFieldValue <code>ContentType</code> field value object 763 * @param fieldValueStr Content-Type field value string 764 * @return <code>HeaderLine</code> object corresponding to what would have been read 765 */ 766 public HeaderLine addHeader(String fieldName, ContentType contentTypeFieldValue, String fieldValueStr) { 767 if (contentTypeFieldValue == null && fieldValueStr != null) { 768 contentTypeFieldValue = fieldParsers.parseContentType(fieldValueStr, fieldName); 769 } else if (fieldValueStr == null && contentTypeFieldValue != null) { 770 fieldValueStr = contentTypeFieldValue.toString(); 771 } 772 return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_CONTENTTYPE, 773 null, null, null, contentTypeFieldValue, null, null, null); 774 } 775 776 /** 777 * Add an Date header using the supplied string and object values and return 778 * a <code>HeaderLine</code> object corresponding to how the header would be read. 779 * If both string and object values are not null they are used as is. 780 * If the string value is null and the object is not null, 781 * the object's toString method is called. 782 * If the object is null and the string is not null, the string is parsed 783 * and validated resulting in an object, if valid. 784 * @param fieldName name of field to add 785 * @param dateFieldValue <code>Date</code> field value object 786 * @param fieldValueStr Date field value string 787 * @return <code>HeaderLine</code> object corresponding to what would have been read 788 */ 789 public HeaderLine addHeader(String fieldName, Date dateFieldValue, String fieldValueStr) { 790 if (dateFieldValue == null && fieldValueStr != null) { 791 dateFieldValue = fieldParsers.parseDate(fieldValueStr, fieldName); 792 } else if (fieldValueStr == null && dateFieldValue != null) { 793 fieldValueStr = warcDateFormat.format(dateFieldValue); 794 } 795 return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_DATE, 796 null, null, null, null, dateFieldValue, null, null); 797 } 798 799 /** 800 * Add an InetAddress header using the supplied string and object values and return 801 * a <code>HeaderLine</code> object corresponding to how the header would be read. 802 * If both string and object values are not null they are used as is. 803 * If the string value is null and the object is not null, 804 * the object's toString method is called. 805 * If the object is null and the string is not null, the string is parsed 806 * and validated resulting in an object, if valid. 807 * @param fieldName name of field to add 808 * @param inetAddrFieldValue <code>InetAddress</code> field value object 809 * @param fieldValueStr IP-Address field value string 810 * @return <code>HeaderLine</code> object corresponding to what would have been read 811 */ 812 public HeaderLine addHeader(String fieldName, InetAddress inetAddrFieldValue, String fieldValueStr) { 813 if (inetAddrFieldValue == null && fieldValueStr != null) { 814 inetAddrFieldValue = fieldParsers.parseIpAddress(fieldValueStr, fieldName); 815 } else if (fieldValueStr == null && inetAddrFieldValue != null) { 816 fieldValueStr = inetAddrFieldValue.getHostAddress(); 817 } 818 return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_INETADDRESS, 819 null, null, null, null, null, inetAddrFieldValue, null); 820 } 821 822 /** 823 * Add an URI header using the supplied string and object values and return 824 * a <code>HeaderLine</code> object corresponding to how the header would be read. 825 * If both string and object values are not null they are used as is. 826 * If the string value is null and the object is not null, 827 * the object's toString method is called. 828 * If the object is null and the string is not null, the string is parsed 829 * and validated resulting in an object, if valid. 830 * @param fieldName name of field to add 831 * @param uriFieldValue <code>URI</code> field value object 832 * @param fieldValueStr URI field value string 833 * @return <code>HeaderLine</code> object corresponding to what would have been read 834 */ 835 public HeaderLine addHeader(String fieldName, Uri uriFieldValue, String fieldValueStr) { 836 if (uriFieldValue == null && fieldValueStr != null) { 837 if (WarcConstants.FN_WARC_TARGET_URI.equalsIgnoreCase(fieldName)) { 838 uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, warcTargetUriProfile, fieldName); 839 } else if (WarcConstants.FN_WARC_PROFILE.equalsIgnoreCase(fieldName)) { 840 uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, uriProfile, fieldName); 841 } else if (WarcConstants.FN_WARC_REFERS_TO_TARGET_URI.equalsIgnoreCase(fieldName)) { 842 uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, warcTargetUriProfile, fieldName); 843 } else { 844 uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_LTGT, uriProfile, fieldName); 845 } 846 } else if (fieldValueStr == null && uriFieldValue != null) { 847 if (WarcConstants.FN_WARC_TARGET_URI.equalsIgnoreCase(fieldName) 848 || WarcConstants.FN_WARC_PROFILE.equalsIgnoreCase(fieldName) 849 || WarcConstants.FN_WARC_REFERS_TO_TARGET_URI.equalsIgnoreCase(fieldName)) { 850 fieldValueStr = uriFieldValue.toString(); 851 } else { 852 fieldValueStr = "<" + uriFieldValue.toString() + ">"; 853 } 854 } 855 return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_URI, 856 null, null, null, null, null, null, uriFieldValue); 857 } 858 859 /** 860 * Add a header with the supplied field name, data type and value and 861 * return a <code>HeaderLine</code> corresponding to how the header will 862 * be read. The data type is validated against the field data type. 863 * The values used are the field value string and the parameter 864 * corresponding to the data type. 865 * @param fieldName header field name 866 * @param fieldValueStr field value in string form 867 * @param dt data type of the field value string when converted to an object 868 * @param integerFieldValue <code>Integer</code> object field value 869 * @param longFieldValue <code>Long</code> object field value 870 * @param digestFieldValue <code>Digest</code> object field value 871 * @param contentTypeFieldValue <code>ContentType</code> object field value 872 * @param dateFieldValue <code>Date</code> object field value 873 * @param inetAddrFieldValue <code>InetAddress</code> object field value 874 * @param uriFieldValue <code>URI</code> object field value 875 * @return <code>HeaderLine</code> object corresponding to what would have been read 876 */ 877 public HeaderLine addHeader(String fieldName, String fieldValueStr, int dt, 878 Integer integerFieldValue, Long longFieldValue, 879 WarcDigest digestFieldValue, ContentType contentTypeFieldValue, 880 Date dateFieldValue, InetAddress inetAddrFieldValue, 881 Uri uriFieldValue) { 882 Integer fn_idx = WarcConstants.fieldNameIdxMap.get(fieldName.toLowerCase()); 883 if (fn_idx != null) { 884 // Implicit cast from integer to long, if needed. 885 if (WarcConstants.FN_IDX_DT[fn_idx] == WarcConstants.FDT_LONG 886 && dt == WarcConstants.FDT_INTEGER) { 887 longFieldValue = (long)integerFieldValue; 888 dt = WarcConstants.FDT_LONG; 889 } 890 if (dt == WarcConstants.FN_IDX_DT[fn_idx]) { 891 // WARC field name defined in WARC specification. 892 if (seen[fn_idx] && !WarcConstants.fieldNamesRepeatableLookup[fn_idx]) { 893 // Duplicate field. 894 addErrorDiagnosis(DiagnosisType.DUPLICATE, 895 "'" + fieldName + "' header", 896 fieldValueStr); 897 } 898 seen[fn_idx] = true; 899 switch (fn_idx.intValue()) { 900 /* 901 * Integer. 902 */ 903 case WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER: 904 warcSegmentNumberStr = fieldValueStr; 905 warcSegmentNumber = integerFieldValue; 906 break; 907 /* 908 * Long. 909 */ 910 case WarcConstants.FN_IDX_CONTENT_LENGTH: 911 contentLengthStr = fieldValueStr; 912 contentLength = longFieldValue; 913 break; 914 case WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH: 915 warcSegmentTotalLengthStr = fieldValueStr; 916 warcSegmentTotalLength = longFieldValue; 917 break; 918 /* 919 * Digest. 920 */ 921 case WarcConstants.FN_IDX_WARC_BLOCK_DIGEST: 922 warcBlockDigestStr = fieldValueStr; 923 warcBlockDigest = digestFieldValue; 924 break; 925 case WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST: 926 warcPayloadDigestStr = fieldValueStr; 927 warcPayloadDigest = digestFieldValue; 928 break; 929 /* 930 * ContentType. 931 */ 932 case WarcConstants.FN_IDX_CONTENT_TYPE: 933 contentTypeStr = fieldValueStr; 934 contentType = contentTypeFieldValue; 935 break; 936 case WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE: 937 warcIdentifiedPayloadTypeStr = fieldValueStr; 938 warcIdentifiedPayloadType = contentTypeFieldValue; 939 break; 940 /* 941 * Date. 942 */ 943 case WarcConstants.FN_IDX_WARC_DATE: 944 warcDateStr = fieldValueStr; 945 warcDate = dateFieldValue; 946 break; 947 case WarcConstants.FN_IDX_WARC_REFERS_TO_DATE: 948 warcRefersToDateStr = fieldValueStr; 949 warcRefersToDate = dateFieldValue; 950 break; 951 /* 952 * InetAddress. 953 */ 954 case WarcConstants.FN_IDX_WARC_IP_ADDRESS: 955 warcIpAddress = fieldValueStr; 956 warcInetAddress = inetAddrFieldValue; 957 break; 958 /* 959 * URI. 960 */ 961 case WarcConstants.FN_IDX_WARC_RECORD_ID: 962 warcRecordIdStr = fieldValueStr; 963 warcRecordIdUri = uriFieldValue; 964 break; 965 case WarcConstants.FN_IDX_WARC_CONCURRENT_TO: 966 if (fieldValueStr != null || uriFieldValue != null) { 967 WarcConcurrentTo warcConcurrentTo = new WarcConcurrentTo(); 968 warcConcurrentTo.warcConcurrentToStr = fieldValueStr; 969 warcConcurrentTo.warcConcurrentToUri = uriFieldValue; 970 warcConcurrentToList.add(warcConcurrentTo); 971 } 972 break; 973 case WarcConstants.FN_IDX_WARC_REFERS_TO: 974 warcRefersToStr = fieldValueStr; 975 warcRefersToUri = uriFieldValue; 976 break; 977 case WarcConstants.FN_IDX_WARC_TARGET_URI: 978 warcTargetUriStr = fieldValueStr; 979 warcTargetUriUri = uriFieldValue; 980 break; 981 case WarcConstants.FN_IDX_WARC_WARCINFO_ID: 982 warcWarcinfoIdStr = fieldValueStr; 983 warcWarcinfoIdUri = uriFieldValue; 984 break; 985 case WarcConstants.FN_IDX_WARC_PROFILE: 986 warcProfileStr = fieldValueStr; 987 warcProfileUri = uriFieldValue; 988 if (warcProfileStr != null) { 989 warcProfileIdx = WarcConstants.profileIdxMap.get(warcProfileStr.toLowerCase()); 990 } 991 if (warcProfileIdx == null && warcProfileStr != null && warcProfileStr.length() > 0) { 992 warcProfileIdx = WarcConstants.PROFILE_IDX_UNKNOWN; 993 } 994 break; 995 case WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID: 996 warcSegmentOriginIdStr = fieldValueStr; 997 warcSegmentOriginIdUrl = uriFieldValue; 998 break; 999 case WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI: 1000 warcRefersToTargetUriStr = fieldValueStr; 1001 warcRefersToTargetUriUri = uriFieldValue; 1002 break; 1003 default: 1004 break; 1005 } 1006 } else { 1007 // Invalid datatype for field. 1008 addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED, 1009 "Invalid datatype for '" + fieldName + "' header", 1010 WarcConstants.FDT_IDX_STRINGS[WarcConstants.FN_IDX_DT[fn_idx]], 1011 WarcConstants.FDT_IDX_STRINGS[dt]); 1012 // Consider throwing exception at some point. 1013 } 1014 } 1015 HeaderLine headerLine = new HeaderLine(); 1016 headerLine.name = fieldName; 1017 headerLine.value = fieldValueStr; 1018 HeaderLine tmpLine = headerMap.get(fieldName.toLowerCase()); 1019 if (tmpLine == null) { 1020 headerMap.put(fieldName.toLowerCase(), headerLine); 1021 } else { 1022 tmpLine.lines.add(headerLine); 1023 } 1024 return headerLine; 1025 } 1026 1027 /** 1028 * Validate the WARC header relative to the WARC-Type and according to the 1029 * WARC ISO standard. 1030 */ 1031 protected void checkFields() { 1032 bMandatoryMissing = false; 1033 1034 /* 1035 * Unknown Warc-Type and/or Warc-Profile. 1036 */ 1037 1038 if (warcTypeIdx != null && warcTypeIdx == WarcConstants.RT_IDX_UNKNOWN) { 1039 // Warning: Unknown Warc-Type. 1040 addWarningDiagnosis(DiagnosisType.UNKNOWN, "'" + WarcConstants.FN_WARC_TYPE + "' value", warcTypeStr); 1041 } 1042 1043 if (warcProfileIdx != null && warcProfileIdx == WarcConstants.PROFILE_IDX_UNKNOWN) { 1044 // Warning: Unknown Warc-Profile. 1045 addWarningDiagnosis(DiagnosisType.UNKNOWN, "'" + WarcConstants.FN_WARC_PROFILE + "' value", warcProfileStr); 1046 } 1047 1048 /* 1049 * Mandatory fields. 1050 */ 1051 1052 // TODO Required yes, but is it always invalid. 1053 if (warcTypeIdx == null) { 1054 // Mandatory valid Warc-Type missing. 1055 addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_TYPE + "' header", warcTypeStr); 1056 bMandatoryMissing = true; 1057 } 1058 if (warcRecordIdUri == null) { 1059 // Mandatory valid Warc-Record-Id missing. 1060 addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_RECORD_ID + "' header", warcRecordIdStr); 1061 bMandatoryMissing = true; 1062 } 1063 if (warcDate == null) { 1064 // Mandatory valid Warc-Date missing. 1065 addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_DATE + "' header", warcDateStr); 1066 bMandatoryMissing = true; 1067 } 1068 if (contentLength == null) { 1069 // Mandatory valid Content-Length missing. 1070 addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_CONTENT_LENGTH + "' header", contentLengthStr); 1071 bMandatoryMissing = true; 1072 } 1073 1074 /* 1075 * Content-Type should be present if Content-Length > 0. 1076 * Except for continuation records. 1077 */ 1078 1079 if (contentLength != null && contentLength.longValue() > 0L && 1080 (contentTypeStr == null || contentTypeStr.length() == 0)) { 1081 if (warcTypeIdx == null || warcTypeIdx != WarcConstants.RT_IDX_CONTINUATION) { 1082 addWarningDiagnosis(DiagnosisType.RECOMMENDED_MISSING, 1083 "'" + WarcConstants.FN_CONTENT_TYPE + "' header"); 1084 } 1085 } 1086 1087 /* 1088 * WARC record type dependent policies. 1089 */ 1090 1091 if (warcTypeIdx != null) { 1092 /* 1093 * Warcinfo record should have "application/warc-fields" content-type. 1094 */ 1095 1096 if (warcTypeIdx == WarcConstants.RT_IDX_WARCINFO) { 1097 if (contentType != null && 1098 (!contentType.contentType.equals("application") 1099 || !contentType.mediaType.equals("warc-fields"))) { 1100 addWarningDiagnosis(DiagnosisType.RECOMMENDED, 1101 "'" + WarcConstants.FN_CONTENT_TYPE + "' value", 1102 WarcConstants.CT_APP_WARC_FIELDS, 1103 contentTypeStr); 1104 } 1105 } 1106 1107 if (warcTypeIdx == WarcConstants.RT_IDX_RESPONSE) { 1108 if (warcSegmentNumber != null && warcSegmentNumber != 1) { 1109 addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED, 1110 "'" + WarcConstants.FN_WARC_SEGMENT_NUMBER + "' value", 1111 warcSegmentNumber.toString(), 1112 "1"); 1113 } 1114 } 1115 1116 if (warcTypeIdx == WarcConstants.RT_IDX_CONTINUATION) { 1117 if (warcSegmentNumber != null && warcSegmentNumber < 2) { 1118 addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED, 1119 "'" + WarcConstants.FN_WARC_SEGMENT_NUMBER + "' value", 1120 warcSegmentNumber.toString(), 1121 ">1"); 1122 } 1123 } 1124 1125 /* 1126 * Check the policies for each field. 1127 */ 1128 1129 WarcConcurrentTo warcConcurrentTo; 1130 if (warcTypeIdx > 0) { 1131 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_CONTENT_TYPE, contentType, contentTypeStr); 1132 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_IP_ADDRESS, warcInetAddress, warcIpAddress); 1133 for (int i=0; i<warcConcurrentToList.size(); ++i) { 1134 warcConcurrentTo = warcConcurrentToList.get(0); 1135 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_CONCURRENT_TO, warcConcurrentTo.warcConcurrentToUri, warcConcurrentTo.warcConcurrentToStr); 1136 } 1137 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO, warcRefersToUri, warcRefersToStr); 1138 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_TARGET_URI, warcTargetUriUri, warcTargetUriStr); 1139 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_TRUNCATED, warcTruncatedIdx, warcTruncatedStr); 1140 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_WARCINFO_ID, warcWarcinfoIdUri, warcWarcinfoIdStr); 1141 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_BLOCK_DIGEST, warcBlockDigest, warcBlockDigestStr); 1142 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST, warcPayloadDigest, warcPayloadDigestStr); 1143 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_FILENAME, warcFilename, warcFilename); 1144 // Could also use warcProfileIdx for really strict. 1145 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_PROFILE, warcProfileUri, warcProfileStr); 1146 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE, warcIdentifiedPayloadType, warcIdentifiedPayloadTypeStr); 1147 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER, warcSegmentNumber, warcSegmentNumberStr); 1148 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID, warcSegmentOriginIdUrl, warcSegmentOriginIdStr); 1149 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH, warcSegmentTotalLength, warcSegmentTotalLengthStr); 1150 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI, warcRefersToTargetUriUri, warcRefersToTargetUriStr); 1151 checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO_DATE, warcRefersToDate, warcRefersToDateStr); 1152 } 1153 } 1154 } 1155 1156 /** 1157 * Given a WARC record type and a WARC field looks up the policy in a 1158 * matrix build from the WARC ISO standard. 1159 * @param recordType WARC record type id 1160 * @param fieldType WARC field type id 1161 * @param fieldObj WARC field 1162 * @param valueStr WARC raw field value 1163 */ 1164 protected void checkFieldPolicy(int recordType, int fieldType, Object fieldObj, String valueStr) { 1165 int policy = WarcConstants.field_policy[recordType][fieldType]; 1166 switch (policy) { 1167 case WarcConstants.POLICY_MANDATORY: 1168 if (fieldObj == null) { 1169 addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, 1170 "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value", 1171 valueStr); 1172 } 1173 break; 1174 case WarcConstants.POLICY_SHALL: 1175 if (fieldObj == null) { 1176 addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, 1177 "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value", 1178 valueStr); 1179 } 1180 break; 1181 case WarcConstants.POLICY_SHALL_NOT: 1182 if (fieldObj != null) { 1183 addErrorDiagnosis(DiagnosisType.UNDESIRED_DATA, 1184 "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value", 1185 valueStr); 1186 } 1187 break; 1188 case WarcConstants.POLICY_MAY_NOT: 1189 if (fieldObj != null) { 1190 addWarningDiagnosis(DiagnosisType.UNDESIRED_DATA, 1191 "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value", 1192 valueStr); 1193 } 1194 break; 1195 case WarcConstants.POLICY_MAY: 1196 case WarcConstants.POLICY_IGNORE: 1197 default: 1198 break; 1199 } 1200 } 1201 1202}