001/**
002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
003 * and GZip files. (http://jwat.org/)
004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.jwat.warc;
019
020import org.jwat.common.ByteCountingPushBackInputStream;
021import org.jwat.common.ContentType;
022import org.jwat.common.Diagnosis;
023import org.jwat.common.DiagnosisType;
024import org.jwat.common.Diagnostics;
025import org.jwat.common.HeaderLine;
026import org.jwat.common.MaxLengthRecordingInputStream;
027import org.jwat.common.Uri;
028import org.jwat.common.UriProfile;
029
030import java.io.ByteArrayOutputStream;
031import java.io.IOException;
032import java.net.InetAddress;
033import java.text.DateFormat;
034import java.util.Collections;
035import java.util.Date;
036import java.util.HashMap;
037import java.util.LinkedList;
038import java.util.List;
039import java.util.Map;
040
041/**
042 * Central class for working with WARC headers. This class includes support for
043 * reading and writing WARC headers. Methods are also available to validate
044 * individual headers and a WARC header as a whole.
045 *
046 * @author nicl
047 */
048public class WarcHeader {
049
    /** A URI with encapsulating <> characters. */
051    public static final boolean URI_LTGT = true;
052
    /** A URI without encapsulating <> characters. */
054    public static final boolean URI_NAKED = false;
055
056    /** Associated WarcReader context.
057     *  Must be set prior to calling the various methods. */
058    protected WarcReader reader;
059
060    /** Diagnostics used to report diagnoses.
061     *  Must be set prior to calling the various methods. */
062    protected Diagnostics<Diagnosis> diagnostics;
063
064    /** WARC-Target-URI profile. */
065    protected UriProfile warcTargetUriProfile;
066
067    /** URI profile. */
068    protected UriProfile uriProfile;
069
070    /** WARC field parser used.
071     *  Must be set prior to calling the various methods. */
072    protected WarcFieldParsers fieldParsers;
073
074    /** WARC <code>DateFormat</code> as specified by the WARC ISO standard. */
075    protected DateFormat warcDateFormat;
076
077    /** WARC record starting offset relative to the source WARC file input
078     *  stream. The offset is correct for both compressed and uncompressed streams. */
079    protected long startOffset = -1;
080
081    /*
082     * Version related fields.
083     */
084
085    /** Was "WARC/" identified while looking for the version string. */
086    public boolean bMagicIdentified;
087    /** Did the version string include between 2 and 4 substrings delimited by ".". */
088    public boolean bVersionParsed;
089    /** Is the version format valid. */
090    public boolean bValidVersionFormat;
091    /** Is the version recognized. (0.17, 0.18 or 1.0) */
092    public boolean bValidVersion;
093
094    /** Raw version string. */
095    public String versionStr;
096    /** Array based on the version string split by the "." delimiter and converted to integers. */
097    public int[] versionArr;
098
099    /** Major version number from WARC header. */
100    public int major = -1;
101    /** Minor version number from WARC header. */
102    public int minor = -1;
103
104    /*
105     * WARC header fields.
106     */
107
108    /** Array used for duplicate header detection. */
109    protected boolean[] seen = new boolean[WarcConstants.FN_INDEX_OF_LAST];
110
111    /** Is the header missing one of the mandatory headers. */
112    public boolean bMandatoryMissing;
113
114    /** WARC-Type field string value. */
115    public String warcTypeStr;
116    /** WARC-Type converted to an integer id, if identified. */
117    public Integer warcTypeIdx;
118
119    /** WARC-Filename field string value.
120     *  (warcinfo record type only) */
121    public String warcFilename;
122
123    /** WARC-Record-Id field string value. */
124    public String warcRecordIdStr;
125    /** WARC-Record-Id converted to an <code>Uri</code> object, if valid. */
126    public Uri warcRecordIdUri;
127
128    /** WARC-Date field string value. */
129    public String warcDateStr;
130    /** WARC-Date converted to a <code>Date</code> object, if valid. */
131    public Date warcDate;
132
133    /** Content-Length field string value. */
134    public String contentLengthStr;
135    /** Content-Length converted to a <code>Long</code> object, if valid. */
136    public Long contentLength;
137
138    /** Content-Type field string value. */
139    public String contentTypeStr;
140    /** Content-Type converted to a <code>ContentType</code> object, if valid. */
141    public ContentType contentType;
142
143    /** WARC-Truncated field string value. */
144    public String warcTruncatedStr;
145    /** WARC-Truncated converted to an integer id, if valid. */
146    public Integer warcTruncatedIdx;
147
148    /** WARC-IP-Address field string value. */
149    public String warcIpAddress;
150    /** WARC-IP-Address converted to an <code>InetAddress</code> object, if valid. */
151    public InetAddress warcInetAddress;
152
    /** List of WARC-Concurrent-To field string values and converted <code>URI</code> objects, if valid. */
154    public List<WarcConcurrentTo> warcConcurrentToList = new LinkedList<WarcConcurrentTo>();
155
156    /** WARC-Refers-To field string value. */
157    public String warcRefersToStr;
158    /** WARC-Refers-To converted to an <code>Uri</code> object, if valid. */
159    public Uri warcRefersToUri;
160
    /** WARC-Target-URI field string value. */
162    public String warcTargetUriStr;
    /** WARC-Target-URI converted to an <code>Uri</code> object, if valid. */
164    public Uri warcTargetUriUri;
165
166    /** WARC-Warcinfo-Id field string value. */
167    public String warcWarcinfoIdStr;
168    /** WARC-Warcinfo-Id converted to an <code>Uri</code> object, if valid. */
169    public Uri warcWarcinfoIdUri;
170
171    /** WARC-Block-Digest field string value. */
172    public String warcBlockDigestStr;
173    /** WARC-Block-Digest converted to a <code>WarcDigest</code> object, if valid. */
174    public WarcDigest warcBlockDigest;
175
176    /** WARC-Payload-Digest field string value. */
177    public String warcPayloadDigestStr;
178    /** WARC-Payload-Digest converted to a <code>WarcDigest</code> object, if valid. */
179    public WarcDigest warcPayloadDigest;
180
181    /** WARC-Identified-Payload-Type field string value. */
182    public String warcIdentifiedPayloadTypeStr;
183    /** WARC-Identified-Payload-Type converted to a <code>ContentType</code> object, if valid. */
184    public ContentType warcIdentifiedPayloadType;
185
186    /** WARC-Profile field string value.
187     *  (revisit record only) */
188    public String warcProfileStr;
189    /** WARC-Profile field converted to an <code>Uri</code> object, if valid.
190     *  (revisit record only) */
191    public Uri warcProfileUri;
192    /** WARC-Profile converted to an integer id, if valid.
193     *  (revisit record only) */
194    public Integer warcProfileIdx;
195
196    /** WARC-Segment-Number field string value. */
197    public String warcSegmentNumberStr;
198    /** WARC-Segment-Number converted to an <code>Integer</code> object, if valid. */
199    public Integer warcSegmentNumber;
200
201    /** WARC-Segment-Origin-Id field string value.
202     *  (continuation record only) */
203    public String warcSegmentOriginIdStr;
204    /** WARC-Segment-Origin-Id converted to an <code>Uri</code> object, if valid.
205     *  (continuation record only) */
206    public Uri warcSegmentOriginIdUrl;
207
208    /** WARC-Segment-Total-Length field string value.
209     *  (continuation record only) */
210    public String warcSegmentTotalLengthStr;
211    /** WARC-Segment-Total-Length converted to a <code>Long</code> object, if valid.
212     *  (continuation record only) */
213    public Long warcSegmentTotalLength;
214
215    // see https://docs.google.com/document/d/1QyQBA7Ykgxie75V8Jziz_O7hbhwf7PF6_u9O6w6zgp0/edit
216    /** WARC-Refers-To-Target-URI field string value. */
217    public String warcRefersToTargetUriStr;
218    /** WARC-Refers-To-Target-URI converted to an <code>Uri</code> object, if valid. */
219    public Uri warcRefersToTargetUriUri;
    /** WARC-Refers-To-Date field string value. */
221    public String warcRefersToDateStr;
    /** WARC-Refers-To-Date converted to a <code>Date</code> object, if valid. */
223    public Date warcRefersToDate;
224
225    /*
226     * WARC header fields collections.
227     */
228
229    /** Raw WARC header output stream. */
230    protected ByteArrayOutputStream headerBytesOut = new ByteArrayOutputStream();
231
232    /** Raw WARC header byte array. */
233    public byte[] headerBytes;
234
235    /** List of parsed header fields. */
236    protected List<HeaderLine> headerList = new LinkedList<HeaderLine>();
237
238    /** Map of parsed header fields. */
239    protected Map<String, HeaderLine> headerMap = new HashMap<String, HeaderLine>();
240
241    /**
242     * Non public constructor to allow unit testing.
243     */
244    protected WarcHeader() {
245    }
246
247    /**
248     * Create and initialize a new <code>WarcHeader</code> for writing.
249     * @param writer writer which shall be used
250     * @param diagnostics diagnostics object used by writer
251     * @return a <code>WarcHeader</code> prepared for writing
252     */
253    public static WarcHeader initHeader(WarcWriter writer, Diagnostics<Diagnosis> diagnostics) {
254        WarcHeader header = new WarcHeader();
255        // Set default version to "1.0".
256        header.major = 1;
257        header.minor = 0;
258        header.warcTargetUriProfile = writer.warcTargetUriProfile;
259        header.uriProfile = writer.uriProfile;
260        header.fieldParsers = writer.fieldParsers;
261        header.warcDateFormat = writer.warcDateFormat;
262        header.diagnostics = diagnostics;
263        return header;
264    }
265
266    /**
267     * Create and initialize a new <code>WarcHeader</code> for reading.
268     * @param reader reader which shall be used
269     * @param startOffset start offset of header
270     * @param diagnostics diagnostics object used by reader
271     * @return a <code>WarcHeader</code> prepared for reading
272     */
273    public static WarcHeader initHeader(WarcReader reader, long startOffset, Diagnostics<Diagnosis> diagnostics) {
274        WarcHeader header = new WarcHeader();
275        header.reader = reader;
276        header.warcTargetUriProfile = reader.warcTargetUriProfile;
277        header.uriProfile = reader.uriProfile;
278        header.fieldParsers = reader.fieldParsers;
279        header.diagnostics = diagnostics;
280        // This is only relevant for uncompressed sequentially read records
281        header.startOffset = startOffset;
282        return header;
283    }
284
285    /**
286     * Add an error diagnosis of the given type on a specific entity with
287     * optional extra information. The information varies according to the
288     * diagnosis type.
289     * @param type diagnosis type
290     * @param entity entity examined
291     * @param information optional extra information
292     */
293    protected void addErrorDiagnosis(DiagnosisType type, String entity, String... information) {
294        diagnostics.addError(new Diagnosis(type, entity, information));
295    }
296
297    /**
298     * Add a warning diagnosis of the given type on a specific entity with
299     * optional extra information. The information varies according to the
300     * diagnosis type.
301     * @param type diagnosis type
302     * @param entity entity examined
303     * @param information optional extra information
304     */
305    protected void addWarningDiagnosis(DiagnosisType type, String entity, String... information) {
306        diagnostics.addWarning(new Diagnosis(type, entity, information));
307    }
308
309    /**
310     * Returns the starting offset of the record in the containing WARC.
311     * @return the starting offset of the record
312     */
313    public long getStartOffset() {
314        return startOffset;
315    }
316
317    /**
318     * Try to parse a WARC header and return a boolean indicating the success or
319     * failure of this.
320     * @param in input stream with WARC data
321     * @return boolean indicating whether a header was parsed or not
322     * @throws IOException if an i/o exception occurs while parsing for a header
323     */
324    public boolean parseHeader(ByteCountingPushBackInputStream in) throws IOException {
325        if (parseVersion(in)) {
326            // debug
327            //System.out.println(wr.bMagicIdentified);
328            //System.out.println(wr.bVersionParsed);
329            //System.out.println(wr.major + "." + wr.minor);
330            if (bVersionParsed && versionArr.length == 2) {
331                switch (major) {
332                case 1:
333                    if (minor == 0) {
334                        bValidVersion = true;
335                    }
336                    break;
337                case 0:
338                    switch (minor) {
339                    case 17:
340                    case 18:
341                        bValidVersion = true;
342                        break;
343                    }
344                    break;
345                default:
346                    break;
347                }
348                if (!bValidVersion) {
349                    diagnostics.addError(
350                            new Diagnosis(DiagnosisType.UNKNOWN,
351                                    "Magic version number", versionStr));
352                }
353            } else {
354                diagnostics.addError(
355                        new Diagnosis(DiagnosisType.INVALID_DATA,
356                                "Magic Version string", versionStr));
357            }
358
359            MaxLengthRecordingInputStream mrin = new MaxLengthRecordingInputStream(in, reader.recordHeaderMaxSize);
360            ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(mrin, reader.recordHeaderMaxSize);
361
362            parseHeaders(pbin);
363            pbin.close();
364
365            checkFields();
366
367            headerBytes = headerBytesOut.toByteArray();
368        }
369        return bMagicIdentified;
370    }
371
372    /**
373     * Looks forward in the input stream for a valid WARC version line.
374     * @param in data input stream
375     * @return true, if magic WARC header found
376     * @throws IOException if an error occurs while reading version data
377     */
378    protected boolean parseVersion(ByteCountingPushBackInputStream in) throws IOException {
379        bMagicIdentified = false;
380        bVersionParsed = false;
381        boolean bInvalidDataBeforeVersion = false;
382        boolean bEmptyLinesBeforeVersion = false;
383        HeaderLine line;
384        String tmpStr;
385        boolean bSeekMagic = true;
386        // Loop until when have found something that looks like a version line.
387        while (bSeekMagic) {
388            // This is only relevant for uncompressed sequentially read records
389            startOffset = in.getConsumed();
390            line = reader.lineReader.readLine(in);
391            if (!reader.lineReader.bEof) {
392                switch (line.type) {
393                case HeaderLine.HLT_LINE:
394                    tmpStr = line.line;
395                    // debug
396                    //System.out.println(tmpStr);
397                    if (tmpStr.length() > 0) {
398                        if (tmpStr.toUpperCase().startsWith(WarcConstants.WARC_MAGIC_HEADER)) {
399                            bMagicIdentified = true;
400                            versionStr = tmpStr.substring(WarcConstants.WARC_MAGIC_HEADER.length());
401                            String[] tmpArr = versionStr.split("\\.", -1);        // Not optimal
402                            if (tmpArr.length >= 2 && tmpArr.length <= 4) {
403                                bVersionParsed = true;
404                                bValidVersionFormat = true;
405                                versionArr = new int[tmpArr.length];
406                                for (int i=0; i<tmpArr.length; ++i) {
407                                    try {
408                                        versionArr[i] = Integer.parseInt(tmpArr[i]);
409                                    } catch (NumberFormatException e) {
410                                        versionArr[i] = -1;
411                                        bValidVersionFormat = false;
412                                    }
413                                }
414                                major = versionArr[0];
415                                minor = versionArr[1];
416                            }
417                            headerBytesOut.write(line.raw);
418                            bSeekMagic = false;
419                        } else {
420                            // Invalid data aka Gibberish.
421                            bInvalidDataBeforeVersion = true;
422                        }
423                    } else {
424                        // Empty line.
425                        bEmptyLinesBeforeVersion = true;
426
427                    }
428                    break;
429                case HeaderLine.HLT_HEADERLINE:
430                    // Invalid data - header or binary.
431                    bInvalidDataBeforeVersion = true;
432                    break;
433                }
434            } else {
435                // EOF.
436                bSeekMagic = false;
437            }
438        }
439        if (bInvalidDataBeforeVersion) {
440            addErrorDiagnosis(DiagnosisType.INVALID, "Data before WARC version");
441        }
442        if (bEmptyLinesBeforeVersion) {
443            addErrorDiagnosis(DiagnosisType.INVALID, "Empty lines before WARC version");
444        }
445        return bMagicIdentified;
446    }
447
448    /**
449     * Reads WARC header lines one line at a time until an empty line is
450     * encountered.
451     * @param in header input stream
452     * @throws IOException if an error occurs while reading the WARC header
453     */
454    protected void parseHeaders(ByteCountingPushBackInputStream in) throws IOException {
455        HeaderLine headerLine;
456        boolean bLoop = true;
457        while (bLoop) {
458            headerLine = reader.headerLineReader.readLine(in);
459            if (!reader.headerLineReader.bEof) {
460                headerBytesOut.write(headerLine.raw);
461                switch (headerLine.type) {
462                case HeaderLine.HLT_HEADERLINE:
463                    if (headerLine.name != null && headerLine.name.length() > 0) {
464                        // debug
465                        //System.out.println(headerLine.name);
466                        //System.out.println(headerLine.value);
467                        addHeader(headerLine);
468                    } else {
469                        // Empty field name.
470                        addWarningDiagnosis(DiagnosisType.EMPTY, "Header line");
471                    }
472                    break;
473                case HeaderLine.HLT_LINE:
474                    if (headerLine.line.length() == 0) {
475                        // Empty line.
476                        bLoop = false;
477                    } else {
478                        // Unknown header line.
479                        addWarningDiagnosis(DiagnosisType.UNKNOWN, "Header line", headerLine.line);
480                    }
481                    break;
482                default:
483                    throw new IllegalStateException("Invalid HeaderLine output!");
484                }
485            } else {
486                // EOF.
487                bLoop = false;
488            }
489        }
490    }
491
492    /**
493     * Identify a (WARC) header name, validate the value and set the header.
494     * @param headerLine the headerLine
495     */
496    protected void addHeader(HeaderLine headerLine) {
497        String fieldName = headerLine.name;
498        String fieldValue = headerLine.value;
499        WarcConcurrentTo warcConcurrentTo;
500        Integer fn_idx = WarcConstants.fieldNameIdxMap.get(fieldName.toLowerCase());
501        if (fn_idx != null) {
502            // WARC field name defined in WARC specification.
503            if (!seen[fn_idx] || WarcConstants.fieldNamesRepeatableLookup[fn_idx]) {
504                seen[fn_idx] = true;
505                switch (fn_idx.intValue()) {
506                case WarcConstants.FN_IDX_WARC_TYPE:
507                    warcTypeStr = fieldParsers.parseString(fieldValue,
508                            WarcConstants.FN_WARC_TYPE);
509                    if (warcTypeStr != null) {
510                        warcTypeIdx = WarcConstants.recordTypeIdxMap.get(warcTypeStr.toLowerCase());
511                    }
512                    if (warcTypeIdx == null && warcTypeStr != null && warcTypeStr.length() > 0) {
513                        warcTypeIdx = WarcConstants.RT_IDX_UNKNOWN;
514                    }
515                    break;
516                case WarcConstants.FN_IDX_WARC_RECORD_ID:
517                    warcRecordIdStr = fieldValue;
518                    warcRecordIdUri = fieldParsers.parseUri(fieldValue, URI_LTGT,
519                            uriProfile, WarcConstants.FN_WARC_RECORD_ID);
520                    break;
521                case WarcConstants.FN_IDX_WARC_DATE:
522                    warcDateStr = fieldValue;
523                    warcDate = fieldParsers.parseDate(fieldValue,
524                            WarcConstants.FN_WARC_DATE);
525                    break;
526                case WarcConstants.FN_IDX_CONTENT_LENGTH:
527                    contentLengthStr = fieldValue;
528                    contentLength = fieldParsers.parseLong(fieldValue,
529                            WarcConstants.FN_CONTENT_LENGTH);
530                    break;
531                case WarcConstants.FN_IDX_CONTENT_TYPE:
532                    contentTypeStr = fieldValue;
533                    contentType = fieldParsers.parseContentType(fieldValue,
534                            WarcConstants.FN_CONTENT_TYPE);
535                    break;
536                case WarcConstants.FN_IDX_WARC_CONCURRENT_TO:
537                    Uri tmpUri = fieldParsers.parseUri(fieldValue, URI_LTGT,
538                            uriProfile, WarcConstants.FN_WARC_CONCURRENT_TO);
539                    if (fieldValue != null && fieldValue.trim().length() > 0) {
540                        warcConcurrentTo = new WarcConcurrentTo();
541                        warcConcurrentTo.warcConcurrentToStr = fieldValue;
542                        warcConcurrentTo.warcConcurrentToUri = tmpUri;
543                        warcConcurrentToList.add(warcConcurrentTo);
544                    }
545                    break;
546                case WarcConstants.FN_IDX_WARC_BLOCK_DIGEST:
547                    warcBlockDigestStr = fieldValue;
548                    warcBlockDigest = fieldParsers.parseDigest(fieldValue,
549                            WarcConstants.FN_WARC_BLOCK_DIGEST);
550                    break;
551                case WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST:
552                    warcPayloadDigestStr = fieldValue;
553                    warcPayloadDigest = fieldParsers.parseDigest(fieldValue,
554                            WarcConstants.FN_WARC_PAYLOAD_DIGEST);
555                    break;
556                case WarcConstants.FN_IDX_WARC_IP_ADDRESS:
557                    warcIpAddress = fieldValue;
558                    warcInetAddress = fieldParsers.parseIpAddress(fieldValue,
559                            WarcConstants.FN_WARC_IP_ADDRESS);
560                    break;
561                case WarcConstants.FN_IDX_WARC_REFERS_TO:
562                    warcRefersToStr = fieldValue;
563                    warcRefersToUri = fieldParsers.parseUri(fieldValue, URI_LTGT,
564                            uriProfile, WarcConstants.FN_WARC_REFERS_TO);
565                    break;
566                case WarcConstants.FN_IDX_WARC_TARGET_URI:
567                    warcTargetUriStr = fieldValue;
568                    warcTargetUriUri = fieldParsers.parseUri(fieldValue, URI_NAKED,
569                            warcTargetUriProfile, WarcConstants.FN_WARC_TARGET_URI);
570                    break;
571                case WarcConstants.FN_IDX_WARC_TRUNCATED:
572                    warcTruncatedStr = fieldParsers.parseString(fieldValue,
573                            WarcConstants.FN_WARC_TRUNCATED);
574                    if (warcTruncatedStr != null) {
575                        warcTruncatedIdx = WarcConstants.truncatedTypeIdxMap.get(warcTruncatedStr.toLowerCase());
576                    }
577                    if (warcTruncatedIdx == null && warcTruncatedStr != null && warcTruncatedStr.length() > 0) {
578                        warcTruncatedIdx = WarcConstants.TT_IDX_FUTURE_REASON;
579                    }
580                    break;
581                case WarcConstants.FN_IDX_WARC_WARCINFO_ID:
582                    warcWarcinfoIdStr = fieldValue;
583                    warcWarcinfoIdUri = fieldParsers.parseUri(fieldValue, URI_LTGT,
584                            uriProfile, WarcConstants.FN_WARC_WARCINFO_ID);
585                    break;
586                case WarcConstants.FN_IDX_WARC_FILENAME:
587                    warcFilename = fieldParsers.parseString(fieldValue,
588                            WarcConstants.FN_WARC_FILENAME);
589                    break;
590                case WarcConstants.FN_IDX_WARC_PROFILE:
591                    warcProfileStr = fieldValue;
592                    warcProfileUri = fieldParsers.parseUri(fieldValue, URI_NAKED,
593                            uriProfile, WarcConstants.FN_WARC_PROFILE);
594                    if (warcProfileStr != null) {
595                        warcProfileIdx = WarcConstants.profileIdxMap.get(warcProfileStr.toLowerCase());
596                    }
597                    if (warcProfileIdx == null && warcProfileStr != null && warcProfileStr.length() > 0) {
598                        warcProfileIdx = WarcConstants.PROFILE_IDX_UNKNOWN;
599                    }
600                    break;
601                case WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE:
602                    warcIdentifiedPayloadTypeStr = fieldValue;
603                    warcIdentifiedPayloadType = fieldParsers.parseContentType(fieldValue,
604                            WarcConstants.FN_WARC_IDENTIFIED_PAYLOAD_TYPE);
605                    break;
606                case WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID:
607                    warcSegmentOriginIdStr = fieldValue;
608                    warcSegmentOriginIdUrl = fieldParsers.parseUri(fieldValue, URI_LTGT,
609                            uriProfile, WarcConstants.FN_WARC_SEGMENT_ORIGIN_ID);
610                    break;
611                case WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER:
612                    warcSegmentNumberStr = fieldValue;
613                    warcSegmentNumber = fieldParsers.parseInteger(fieldValue,
614                            WarcConstants.FN_WARC_SEGMENT_NUMBER);
615                    break;
616                case WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH:
617                    warcSegmentTotalLengthStr = fieldValue;
618                    warcSegmentTotalLength = fieldParsers.parseLong(fieldValue,
619                            WarcConstants.FN_WARC_SEGMENT_TOTAL_LENGTH);
620                    break;
621                case WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI:
622                    warcRefersToTargetUriStr = fieldValue;
623                    warcRefersToTargetUriUri = fieldParsers.parseUri(fieldValue, URI_NAKED,
624                            uriProfile, WarcConstants.FN_WARC_REFERS_TO_TARGET_URI);
625                    break;
626                case WarcConstants.FN_IDX_WARC_REFERS_TO_DATE:
627                    warcRefersToDateStr = fieldValue;
628                    warcRefersToDate = fieldParsers.parseDate(fieldValue,
629                            WarcConstants.FN_WARC_REFERS_TO_DATE);
630                    break;
631                }
632            } else {
633                // Duplicate field.
634                addErrorDiagnosis(DiagnosisType.DUPLICATE, "'" + fieldName + "' header", fieldValue);
635            }
636        }
637        HeaderLine tmpLine = headerMap.get(fieldName.toLowerCase());
638        if (tmpLine == null) {
639            headerMap.put(fieldName.toLowerCase(), headerLine);
640        } else {
641            tmpLine.lines.add(headerLine);
642        }
643        headerList.add(headerLine);
644    }
645
646    /**
647     * Get a <code>List</code> of all the headers found during parsing.
648     * @return <code>List</code> of <code>HeaderLine</code>
649     */
650    public List<HeaderLine> getHeaderList() {
651        return Collections.unmodifiableList(headerList);
652    }
653
654    /**
655     * Get a header line structure or null, if no header line structure is
656     * stored with the given header name.
657     * @param field header name
658     * @return <code>HeaderLine</code> structure or null
659     */
660    public HeaderLine getHeader(String field) {
661        if (field != null && field.length() > 0) {
662            return headerMap.get(field.toLowerCase());
663        } else {
664            return null;
665        }
666    }
667
668    /**
669     * Add a String header using the supplied string and return a
670     * <code>HeaderLine</code> object corresponding to how the header would be
671     * read.
672     * @param fieldName name of field to add
673     * @param fieldValue field value string
674     * @return <code>HeaderLine</code> object corresponding to what would have been read
675     */
676    public HeaderLine addHeader(String fieldName, String fieldValue) {
677        HeaderLine headerLine = new HeaderLine();
678        headerLine.name = fieldName;
679        headerLine.value = fieldValue;
680        addHeader(headerLine);
681        return headerLine;
682    }
683
684    /**
685     * Add an Integer header using the supplied string and object values and return
686     * a <code>HeaderLine</code> object corresponding to how the header would be read.
687     * If both string and object values are not null they are used as is.
688     * If the string value is null and the object is not null,
689     * the object's toString method is called.
690     * If the object is null and the string is not null, the string is parsed
691     * and validated resulting in an object, if valid.
692     * @param fieldName name of field to add
693     * @param integerFieldValue <code>Integer</code> field value object
694     * @param fieldValueStr Integer field value string
695     * @return <code>HeaderLine</code> object corresponding to what would have been read
696     */
697    public HeaderLine addHeader(String fieldName, Integer integerFieldValue, String fieldValueStr) {
698        if (integerFieldValue == null && fieldValueStr != null) {
699            integerFieldValue = fieldParsers.parseInteger(fieldValueStr, fieldName);
700        } else if (fieldValueStr == null && integerFieldValue != null) {
701            fieldValueStr = integerFieldValue.toString();
702        }
703        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_INTEGER,
704                integerFieldValue, null, null, null, null, null, null);
705    }
706
707    /**
708     * Add a Long header using the supplied string and object values and return
709     * a <code>HeaderLine</code> object corresponding to how the header would be read.
710     * If both string and object values are not null they are used as is.
711     * If the string value is null and the object is not null,
712     * the object's toString method is called.
713     * If the object is null and the string is not null, the string is parsed
714     * and validated resulting in an object, if valid.
715     * @param fieldName name of field to add
716     * @param longFieldValue <code>Long</code> field value object
717     * @param fieldValueStr Long field value string
718     * @return <code>HeaderLine</code> object corresponding to what would have been read
719     */
720    public HeaderLine addHeader(String fieldName, Long longFieldValue, String fieldValueStr) {
721        if (longFieldValue == null && fieldValueStr != null) {
722            longFieldValue = fieldParsers.parseLong(fieldValueStr, fieldName);
723        } else if (fieldValueStr == null && longFieldValue != null) {
724            fieldValueStr = longFieldValue.toString();
725        }
726        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_LONG,
727                null, longFieldValue, null, null, null, null, null);
728    }
729
730    /**
731     * Add an Digest header using the supplied string and object values and return
732     * a <code>HeaderLine</code> object corresponding to how the header would be read.
733     * If both string and object values are not null they are used as is.
734     * If the string value is null and the object is not null,
735     * the object's toString method is called.
736     * If the object is null and the string is not null, the string is parsed
737     * and validated resulting in an object, if valid.
738     * @param fieldName name of field to add
739     * @param digestFieldValue <code>Digest</code> field value object
740     * @param fieldValueStr Digest field value string
741     * @return <code>HeaderLine</code> object corresponding to what would have been read
742     */
743    public HeaderLine addHeader(String fieldName, WarcDigest digestFieldValue, String fieldValueStr) {
744        if (digestFieldValue == null && fieldValueStr != null) {
745            digestFieldValue = fieldParsers.parseDigest(fieldValueStr, fieldName);
746        } else if (fieldValueStr == null && digestFieldValue != null) {
747            fieldValueStr = digestFieldValue.toString();
748        }
749        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_DIGEST,
750                null, null, digestFieldValue, null, null, null, null);
751    }
752
753    /**
754     * Add an Content-Type header using the supplied string and object values and return
755     * a <code>HeaderLine</code> object corresponding to how the header would be read.
756     * If both string and object values are not null they are used as is.
757     * If the string value is null and the object is not null,
758     * the object's toString method is called.
759     * If the object is null and the string is not null, the string is parsed
760     * and validated resulting in an object, if valid.
761     * @param fieldName name of field to add
762     * @param contentTypeFieldValue <code>ContentType</code> field value object
763     * @param fieldValueStr Content-Type field value string
764     * @return <code>HeaderLine</code> object corresponding to what would have been read
765     */
766    public HeaderLine addHeader(String fieldName, ContentType contentTypeFieldValue, String fieldValueStr) {
767        if (contentTypeFieldValue == null && fieldValueStr != null) {
768            contentTypeFieldValue = fieldParsers.parseContentType(fieldValueStr, fieldName);
769        } else if (fieldValueStr == null && contentTypeFieldValue != null) {
770            fieldValueStr = contentTypeFieldValue.toString();
771        }
772        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_CONTENTTYPE,
773                null, null, null, contentTypeFieldValue, null, null, null);
774    }
775
776    /**
777     * Add an Date header using the supplied string and object values and return
778     * a <code>HeaderLine</code> object corresponding to how the header would be read.
779     * If both string and object values are not null they are used as is.
780     * If the string value is null and the object is not null,
781     * the object's toString method is called.
782     * If the object is null and the string is not null, the string is parsed
783     * and validated resulting in an object, if valid.
784     * @param fieldName name of field to add
785     * @param dateFieldValue <code>Date</code> field value object
786     * @param fieldValueStr Date field value string
787     * @return <code>HeaderLine</code> object corresponding to what would have been read
788     */
789    public HeaderLine addHeader(String fieldName, Date dateFieldValue, String fieldValueStr) {
790        if (dateFieldValue == null && fieldValueStr != null) {
791            dateFieldValue = fieldParsers.parseDate(fieldValueStr, fieldName);
792        } else if (fieldValueStr == null && dateFieldValue != null) {
793            fieldValueStr = warcDateFormat.format(dateFieldValue);
794        }
795        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_DATE,
796                null, null, null, null, dateFieldValue, null, null);
797    }
798
799    /**
800     * Add an InetAddress header using the supplied string and object values and return
801     * a <code>HeaderLine</code> object corresponding to how the header would be read.
802     * If both string and object values are not null they are used as is.
803     * If the string value is null and the object is not null,
804     * the object's toString method is called.
805     * If the object is null and the string is not null, the string is parsed
806     * and validated resulting in an object, if valid.
807     * @param fieldName name of field to add
808     * @param inetAddrFieldValue <code>InetAddress</code> field value object
809     * @param fieldValueStr IP-Address field value string
810     * @return <code>HeaderLine</code> object corresponding to what would have been read
811     */
812    public HeaderLine addHeader(String fieldName, InetAddress inetAddrFieldValue, String fieldValueStr) {
813        if (inetAddrFieldValue == null && fieldValueStr != null) {
814            inetAddrFieldValue = fieldParsers.parseIpAddress(fieldValueStr, fieldName);
815        } else if (fieldValueStr == null && inetAddrFieldValue != null) {
816            fieldValueStr = inetAddrFieldValue.getHostAddress();
817        }
818        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_INETADDRESS,
819                null, null, null, null, null, inetAddrFieldValue, null);
820    }
821
822    /**
823     * Add an URI header using the supplied string and object values and return
824     * a <code>HeaderLine</code> object corresponding to how the header would be read.
825     * If both string and object values are not null they are used as is.
826     * If the string value is null and the object is not null,
827     * the object's toString method is called.
828     * If the object is null and the string is not null, the string is parsed
829     * and validated resulting in an object, if valid.
830     * @param fieldName name of field to add
831     * @param uriFieldValue <code>URI</code> field value object
832     * @param fieldValueStr URI field value string
833     * @return <code>HeaderLine</code> object corresponding to what would have been read
834     */
835    public HeaderLine addHeader(String fieldName, Uri uriFieldValue, String fieldValueStr) {
836        if (uriFieldValue == null && fieldValueStr != null) {
837            if (WarcConstants.FN_WARC_TARGET_URI.equalsIgnoreCase(fieldName)) {
838                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, warcTargetUriProfile, fieldName);
839            } else if (WarcConstants.FN_WARC_PROFILE.equalsIgnoreCase(fieldName)) {
840                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, uriProfile, fieldName);
841            } else if (WarcConstants.FN_WARC_REFERS_TO_TARGET_URI.equalsIgnoreCase(fieldName)) {
842                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, warcTargetUriProfile, fieldName);
843            } else {
844                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_LTGT, uriProfile, fieldName);
845            }
846        } else if (fieldValueStr == null && uriFieldValue != null) {
847            if (WarcConstants.FN_WARC_TARGET_URI.equalsIgnoreCase(fieldName)
848                    || WarcConstants.FN_WARC_PROFILE.equalsIgnoreCase(fieldName)
849                    || WarcConstants.FN_WARC_REFERS_TO_TARGET_URI.equalsIgnoreCase(fieldName)) {
850                fieldValueStr = uriFieldValue.toString();
851            } else {
852                fieldValueStr = "<" + uriFieldValue.toString() + ">";
853            }
854        }
855        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_URI,
856                null, null, null, null, null, null, uriFieldValue);
857    }
858
859    /**
860     * Add a header with the supplied field name, data type and value and
861     * return a <code>HeaderLine</code> corresponding to how the header will
862     * be read. The data type is validated against the field data type.
863     * The values used are the field value string and the parameter
864     * corresponding to the data type.
865     * @param fieldName header field name
866     * @param fieldValueStr field value in string form
867     * @param dt data type of the field value string when converted to an object
868     * @param integerFieldValue <code>Integer</code> object field value
869     * @param longFieldValue <code>Long</code> object field value
870     * @param digestFieldValue <code>Digest</code> object field value
871     * @param contentTypeFieldValue <code>ContentType</code> object field value
872     * @param dateFieldValue <code>Date</code> object field value
873     * @param inetAddrFieldValue <code>InetAddress</code> object field value
874     * @param uriFieldValue <code>URI</code> object field value
875     * @return <code>HeaderLine</code> object corresponding to what would have been read
876     */
877    public HeaderLine addHeader(String fieldName, String fieldValueStr, int dt,
878            Integer integerFieldValue, Long longFieldValue,
879            WarcDigest digestFieldValue, ContentType contentTypeFieldValue,
880            Date dateFieldValue, InetAddress inetAddrFieldValue,
881            Uri uriFieldValue) {
882        Integer fn_idx = WarcConstants.fieldNameIdxMap.get(fieldName.toLowerCase());
883        if (fn_idx != null) {
884            // Implicit cast from integer to long, if needed.
885            if (WarcConstants.FN_IDX_DT[fn_idx] == WarcConstants.FDT_LONG
886                    && dt == WarcConstants.FDT_INTEGER) {
887                longFieldValue = (long)integerFieldValue;
888                dt = WarcConstants.FDT_LONG;
889            }
890            if (dt == WarcConstants.FN_IDX_DT[fn_idx]) {
891                // WARC field name defined in WARC specification.
892                if (seen[fn_idx] && !WarcConstants.fieldNamesRepeatableLookup[fn_idx]) {
893                    // Duplicate field.
894                    addErrorDiagnosis(DiagnosisType.DUPLICATE,
895                            "'" + fieldName + "' header",
896                            fieldValueStr);
897                }
898                seen[fn_idx] = true;
899                switch (fn_idx.intValue()) {
900                /*
901                 * Integer.
902                 */
903                case WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER:
904                    warcSegmentNumberStr = fieldValueStr;
905                    warcSegmentNumber = integerFieldValue;
906                    break;
907                /*
908                 * Long.
909                 */
910                case WarcConstants.FN_IDX_CONTENT_LENGTH:
911                    contentLengthStr = fieldValueStr;
912                    contentLength = longFieldValue;
913                    break;
914                case WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH:
915                    warcSegmentTotalLengthStr = fieldValueStr;
916                    warcSegmentTotalLength = longFieldValue;
917                    break;
918                /*
919                 * Digest.
920                 */
921                case WarcConstants.FN_IDX_WARC_BLOCK_DIGEST:
922                    warcBlockDigestStr = fieldValueStr;
923                    warcBlockDigest = digestFieldValue;
924                    break;
925                case WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST:
926                    warcPayloadDigestStr = fieldValueStr;
927                    warcPayloadDigest = digestFieldValue;
928                    break;
929                /*
930                 * ContentType.
931                 */
932                case WarcConstants.FN_IDX_CONTENT_TYPE:
933                    contentTypeStr = fieldValueStr;
934                    contentType = contentTypeFieldValue;
935                    break;
936                case WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE:
937                    warcIdentifiedPayloadTypeStr = fieldValueStr;
938                    warcIdentifiedPayloadType = contentTypeFieldValue;
939                    break;
940                /*
941                 * Date.
942                 */
943                case WarcConstants.FN_IDX_WARC_DATE:
944                    warcDateStr = fieldValueStr;
945                    warcDate = dateFieldValue;
946                    break;
947                case WarcConstants.FN_IDX_WARC_REFERS_TO_DATE:
948                    warcRefersToDateStr = fieldValueStr;
949                    warcRefersToDate = dateFieldValue;
950                    break;
951                /*
952                 * InetAddress.
953                 */
954                case WarcConstants.FN_IDX_WARC_IP_ADDRESS:
955                    warcIpAddress = fieldValueStr;
956                    warcInetAddress = inetAddrFieldValue;
957                    break;
958                /*
959                 * URI.
960                 */
961                case WarcConstants.FN_IDX_WARC_RECORD_ID:
962                    warcRecordIdStr = fieldValueStr;
963                    warcRecordIdUri = uriFieldValue;
964                    break;
965                case WarcConstants.FN_IDX_WARC_CONCURRENT_TO:
966                    if (fieldValueStr != null || uriFieldValue != null) {
967                        WarcConcurrentTo warcConcurrentTo = new WarcConcurrentTo();
968                        warcConcurrentTo.warcConcurrentToStr = fieldValueStr;
969                        warcConcurrentTo.warcConcurrentToUri = uriFieldValue;
970                        warcConcurrentToList.add(warcConcurrentTo);
971                    }
972                    break;
973                case WarcConstants.FN_IDX_WARC_REFERS_TO:
974                    warcRefersToStr = fieldValueStr;
975                    warcRefersToUri = uriFieldValue;
976                    break;
977                case WarcConstants.FN_IDX_WARC_TARGET_URI:
978                    warcTargetUriStr = fieldValueStr;
979                    warcTargetUriUri = uriFieldValue;
980                    break;
981                case WarcConstants.FN_IDX_WARC_WARCINFO_ID:
982                    warcWarcinfoIdStr = fieldValueStr;
983                    warcWarcinfoIdUri = uriFieldValue;
984                    break;
985                case WarcConstants.FN_IDX_WARC_PROFILE:
986                    warcProfileStr = fieldValueStr;
987                    warcProfileUri = uriFieldValue;
988                    if (warcProfileStr != null) {
989                        warcProfileIdx = WarcConstants.profileIdxMap.get(warcProfileStr.toLowerCase());
990                    }
991                    if (warcProfileIdx == null && warcProfileStr != null && warcProfileStr.length() > 0) {
992                        warcProfileIdx = WarcConstants.PROFILE_IDX_UNKNOWN;
993                    }
994                    break;
995                case WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID:
996                    warcSegmentOriginIdStr = fieldValueStr;
997                    warcSegmentOriginIdUrl = uriFieldValue;
998                    break;
999                case WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI:
1000                    warcRefersToTargetUriStr = fieldValueStr;
1001                    warcRefersToTargetUriUri = uriFieldValue;
1002                    break;
1003                default:
1004                    break;
1005                }
1006            } else {
1007                // Invalid datatype for field.
1008                addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
1009                        "Invalid datatype for '" + fieldName + "' header",
1010                        WarcConstants.FDT_IDX_STRINGS[WarcConstants.FN_IDX_DT[fn_idx]],
1011                        WarcConstants.FDT_IDX_STRINGS[dt]);
1012                // Consider throwing exception at some point.
1013            }
1014        }
1015        HeaderLine headerLine = new HeaderLine();
1016        headerLine.name = fieldName;
1017        headerLine.value = fieldValueStr;
1018        HeaderLine tmpLine = headerMap.get(fieldName.toLowerCase());
1019        if (tmpLine == null) {
1020            headerMap.put(fieldName.toLowerCase(), headerLine);
1021        } else {
1022            tmpLine.lines.add(headerLine);
1023        }
1024        return headerLine;
1025    }
1026
1027    /**
1028     * Validate the WARC header relative to the WARC-Type and according to the
1029     * WARC ISO standard.
1030     */
1031    protected void checkFields() {
1032        bMandatoryMissing = false;
1033
1034        /*
1035         * Unknown Warc-Type and/or Warc-Profile.
1036         */
1037
1038        if (warcTypeIdx != null && warcTypeIdx == WarcConstants.RT_IDX_UNKNOWN) {
1039            // Warning: Unknown Warc-Type.
1040            addWarningDiagnosis(DiagnosisType.UNKNOWN, "'" + WarcConstants.FN_WARC_TYPE + "' value", warcTypeStr);
1041        }
1042
1043        if (warcProfileIdx != null && warcProfileIdx == WarcConstants.PROFILE_IDX_UNKNOWN) {
1044            // Warning: Unknown Warc-Profile.
1045            addWarningDiagnosis(DiagnosisType.UNKNOWN, "'" + WarcConstants.FN_WARC_PROFILE + "' value", warcProfileStr);
1046        }
1047
1048        /*
1049         * Mandatory fields.
1050         */
1051
1052        // TODO Required yes, but is it always invalid.
1053        if (warcTypeIdx == null) {
1054            // Mandatory valid Warc-Type missing.
1055            addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_TYPE + "' header", warcTypeStr);
1056            bMandatoryMissing = true;
1057        }
1058        if (warcRecordIdUri == null) {
1059            // Mandatory valid Warc-Record-Id missing.
1060            addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_RECORD_ID + "' header", warcRecordIdStr);
1061            bMandatoryMissing = true;
1062        }
1063        if (warcDate == null) {
1064            // Mandatory valid Warc-Date missing.
1065            addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_DATE + "' header", warcDateStr);
1066            bMandatoryMissing = true;
1067        }
1068        if (contentLength == null) {
1069            // Mandatory valid Content-Length missing.
1070            addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_CONTENT_LENGTH + "' header", contentLengthStr);
1071            bMandatoryMissing = true;
1072        }
1073
1074        /*
1075         * Content-Type should be present if Content-Length > 0.
1076         * Except for continuation records.
1077         */
1078
1079        if (contentLength != null && contentLength.longValue() > 0L &&
1080                        (contentTypeStr == null || contentTypeStr.length() == 0)) {
1081            if (warcTypeIdx == null || warcTypeIdx != WarcConstants.RT_IDX_CONTINUATION) {
1082                addWarningDiagnosis(DiagnosisType.RECOMMENDED_MISSING,
1083                        "'" + WarcConstants.FN_CONTENT_TYPE + "' header");
1084            }
1085        }
1086
1087        /*
1088         * WARC record type dependent policies.
1089         */
1090
1091        if (warcTypeIdx != null) {
1092            /*
1093             * Warcinfo record should have "application/warc-fields" content-type.
1094             */
1095
1096            if (warcTypeIdx == WarcConstants.RT_IDX_WARCINFO) {
1097                if (contentType != null &&
1098                        (!contentType.contentType.equals("application")
1099                        || !contentType.mediaType.equals("warc-fields"))) {
1100                    addWarningDiagnosis(DiagnosisType.RECOMMENDED,
1101                            "'" + WarcConstants.FN_CONTENT_TYPE + "' value",
1102                            WarcConstants.CT_APP_WARC_FIELDS,
1103                            contentTypeStr);
1104                }
1105            }
1106
1107            if (warcTypeIdx == WarcConstants.RT_IDX_RESPONSE) {
1108                if (warcSegmentNumber != null && warcSegmentNumber != 1) {
1109                    addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
1110                            "'" + WarcConstants.FN_WARC_SEGMENT_NUMBER + "' value",
1111                            warcSegmentNumber.toString(),
1112                            "1");
1113                }
1114            }
1115
1116            if (warcTypeIdx == WarcConstants.RT_IDX_CONTINUATION) {
1117                if (warcSegmentNumber != null && warcSegmentNumber < 2) {
1118                    addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
1119                            "'" + WarcConstants.FN_WARC_SEGMENT_NUMBER + "' value",
1120                            warcSegmentNumber.toString(),
1121                            ">1");
1122                }
1123            }
1124
1125            /*
1126             * Check the policies for each field.
1127             */
1128
1129            WarcConcurrentTo warcConcurrentTo;
1130            if (warcTypeIdx  > 0) {
1131                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_CONTENT_TYPE, contentType, contentTypeStr);
1132                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_IP_ADDRESS, warcInetAddress, warcIpAddress);
1133                for (int i=0; i<warcConcurrentToList.size(); ++i) {
1134                    warcConcurrentTo = warcConcurrentToList.get(0);
1135                    checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_CONCURRENT_TO, warcConcurrentTo.warcConcurrentToUri, warcConcurrentTo.warcConcurrentToStr);
1136                }
1137                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO, warcRefersToUri, warcRefersToStr);
1138                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_TARGET_URI, warcTargetUriUri, warcTargetUriStr);
1139                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_TRUNCATED, warcTruncatedIdx, warcTruncatedStr);
1140                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_WARCINFO_ID, warcWarcinfoIdUri, warcWarcinfoIdStr);
1141                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_BLOCK_DIGEST, warcBlockDigest, warcBlockDigestStr);
1142                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST, warcPayloadDigest, warcPayloadDigestStr);
1143                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_FILENAME, warcFilename, warcFilename);
1144                // Could also use warcProfileIdx for really strict.
1145                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_PROFILE, warcProfileUri, warcProfileStr);
1146                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE, warcIdentifiedPayloadType, warcIdentifiedPayloadTypeStr);
1147                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER, warcSegmentNumber, warcSegmentNumberStr);
1148                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID, warcSegmentOriginIdUrl, warcSegmentOriginIdStr);
1149                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH, warcSegmentTotalLength, warcSegmentTotalLengthStr);
1150                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI, warcRefersToTargetUriUri, warcRefersToTargetUriStr);
1151                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO_DATE, warcRefersToDate, warcRefersToDateStr);
1152            }
1153        }
1154    }
1155
1156    /**
1157     * Given a WARC record type and a WARC field looks up the policy in a
1158     * matrix build from the WARC ISO standard.
1159     * @param recordType WARC record type id
1160     * @param fieldType WARC field type id
1161     * @param fieldObj WARC field
1162     * @param valueStr WARC raw field value
1163     */
1164    protected void checkFieldPolicy(int recordType, int fieldType, Object fieldObj, String valueStr) {
1165        int policy = WarcConstants.field_policy[recordType][fieldType];
1166        switch (policy) {
1167        case WarcConstants.POLICY_MANDATORY:
1168            if (fieldObj == null) {
1169                addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID,
1170                        "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value",
1171                        valueStr);
1172            }
1173            break;
1174        case WarcConstants.POLICY_SHALL:
1175            if (fieldObj == null) {
1176                addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID,
1177                        "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value",
1178                        valueStr);
1179            }
1180            break;
1181        case WarcConstants.POLICY_SHALL_NOT:
1182            if (fieldObj != null) {
1183                addErrorDiagnosis(DiagnosisType.UNDESIRED_DATA,
1184                        "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value",
1185                        valueStr);
1186            }
1187            break;
1188        case WarcConstants.POLICY_MAY_NOT:
1189            if (fieldObj != null) {
1190                addWarningDiagnosis(DiagnosisType.UNDESIRED_DATA,
1191                        "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value",
1192                        valueStr);
1193            }
1194            break;
1195        case WarcConstants.POLICY_MAY:
1196        case WarcConstants.POLICY_IGNORE:
1197        default:
1198            break;
1199        }
1200    }
1201
1202}