001/**
002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
003 * and GZip files. (http://jwat.org/)
004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.jwat.warc;
019
020import java.util.HashMap;
021import java.util.Map;
022
/**
 * Class containing all relevant WARC constants and structures.
 * Including but not limited to field names and mime-types.
 * Also includes statically initialized structures for validation.
 *
 * Field names, record types, truncation reasons and revisit profiles are
 * each identified by a small integer id. Ids start at 1; id 0 is reserved
 * for "unknown"/"unused", which is why the id-indexed lookup tables below
 * have one more slot than there are known values.
 *
 * @author nicl
 */
public class WarcConstants {

    /**
     * This utility class does not require instantiation.
     */
    protected WarcConstants() {
    }

    /**
     * A WARC header block starts with this string including trailing version
     * information.
     */
    public static final String WARC_MAGIC_HEADER = "WARC/";

    /**
     * End mark used after each record consisting of two newlines
     * (CR LF CR LF). Spelled out as bytes so the value does not depend on
     * the platform default charset.
     */
    protected static byte[] endMark = {'\r', '\n', '\r', '\n'};

    /** WARC date format string as specified by the WARC ISO standard. */
    public static final String WARC_DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";

    /** WARC digest format string as specified by the WARC ISO standard. */
    public static final String WARC_DIGEST_FORMAT = "<digest-algorithm>:<digest-encoded>";

    /** Content-type format string as specified in RFC2616. */
    public static final String CONTENT_TYPE_FORMAT = "<type>/<sub-type>(; <argument>=<value>)*";

    /*
     * WARC content-types (MIME).
     */

    /** WARC mime type. */
    public static final String WARC_MIME_TYPE = "application/warc";

    /** Suggested content-type/media-type for metadata records and others. */
    public static final String CT_APP_WARC_FIELDS = "application/warc-fields";

    /** Suggested content-type for metadata records and others. */
    public static final String CONTENT_TYPE_METADATA = "application";

    /** Suggested media-type for metadata records and others. */
    public static final String MEDIA_TYPE_METADATA = "warc-fields";

    //"text/dns"
    //"application/http;msgtype=request"
    //"application/http;msgtype=response"

    /*
     * Voodoo magic constants.
     */

    /** Trailing newlines after each record as per the WARC ISO standard. */
    public static final int WARC_RECORD_TRAILING_NEWLINES = 2;

    /** Number of WARC fields. */
    public static final int FN_NUMBER = 21;

    /**
     * Size of arrays indexed by WARC field id, i.e. {@code FN_NUMBER + 1}.
     * Field ids run from 1 to {@link #FN_NUMBER} and slot 0 is unused, so
     * despite its name this is one past the index of the last field.
     */
    public static final int FN_INDEX_OF_LAST = FN_NUMBER+1;

    /** Number of WARC types. */
    public static final int RT_NUMBER = 8;

    /**
     * Size of arrays indexed by WARC record type id, i.e.
     * {@code RT_NUMBER + 1}. Type ids run from 1 to {@link #RT_NUMBER} and
     * slot 0 is the unknown type, so despite its name this is one past the
     * index of the last type.
     */
    public static final int RT_INDEX_OF_LAST = RT_NUMBER+1;

    /*
     * WARC field names.
     */

    /** Warc-type field name. */
    public static final String FN_WARC_TYPE = "WARC-Type";
    /** Warc-record-id field name. */
    public static final String FN_WARC_RECORD_ID = "WARC-Record-ID";
    /** Warc-date field name. */
    public static final String FN_WARC_DATE = "WARC-Date";
    /** Content-length field name. */
    public static final String FN_CONTENT_LENGTH = "Content-Length";
    /** Content-type field name. */
    public static final String FN_CONTENT_TYPE = "Content-Type";
    /** Warc-concurrent-to field name. */
    public static final String FN_WARC_CONCURRENT_TO = "WARC-Concurrent-To";
    /** Warc-block-digest field name. */
    public static final String FN_WARC_BLOCK_DIGEST = "WARC-Block-Digest";
    /** Warc-payload-digest field name. */
    public static final String FN_WARC_PAYLOAD_DIGEST = "WARC-Payload-Digest";
    /** Warc-ip-address field name. */
    public static final String FN_WARC_IP_ADDRESS = "WARC-IP-Address";
    /** Warc-refers-to field name. */
    public static final String FN_WARC_REFERS_TO = "WARC-Refers-To";
    /** Warc-target-uri field name. */
    public static final String FN_WARC_TARGET_URI = "WARC-Target-URI";
    /** Warc-truncated field name. */
    public static final String FN_WARC_TRUNCATED = "WARC-Truncated";
    /** Warc-warcinfo-id field name. */
    public static final String FN_WARC_WARCINFO_ID = "WARC-Warcinfo-ID";
    /** Warc-filename field name. */
    public static final String FN_WARC_FILENAME = "WARC-Filename";
    /** Warc-profile field name. */
    public static final String FN_WARC_PROFILE = "WARC-Profile";
    /** Warc-identified-payload-type field name. */
    public static final String FN_WARC_IDENTIFIED_PAYLOAD_TYPE = "WARC-Identified-Payload-Type";
    /** Warc-segment-origin-id field name. */
    public static final String FN_WARC_SEGMENT_ORIGIN_ID = "WARC-Segment-Origin-ID";
    /** Warc-segment-number field name. */
    public static final String FN_WARC_SEGMENT_NUMBER = "WARC-Segment-Number";
    /** Warc-segment-total-length field name. */
    public static final String FN_WARC_SEGMENT_TOTAL_LENGTH = "WARC-Segment-Total-Length";
    /** WARC-Refers-To-Target-URI field name. */
    public static final String FN_WARC_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI";
    /** WARC-Refers-To-Date field name. */
    public static final String FN_WARC_REFERS_TO_DATE = "WARC-Refers-To-Date";

    /** WARC field name id to field name mapping table.
     *  Zero indexed array with all indexes &gt;= 1 used. (Index 0 is unused) */
    public static final String[] FN_IDX_STRINGS = {
        null,
        FN_WARC_TYPE,
        FN_WARC_RECORD_ID,
        FN_WARC_DATE,
        FN_CONTENT_LENGTH,
        FN_CONTENT_TYPE,
        FN_WARC_CONCURRENT_TO,
        FN_WARC_BLOCK_DIGEST,
        FN_WARC_PAYLOAD_DIGEST,
        FN_WARC_IP_ADDRESS,
        FN_WARC_REFERS_TO,
        FN_WARC_TARGET_URI,
        FN_WARC_TRUNCATED,
        FN_WARC_WARCINFO_ID,
        FN_WARC_FILENAME,
        FN_WARC_PROFILE,
        FN_WARC_IDENTIFIED_PAYLOAD_TYPE,
        FN_WARC_SEGMENT_ORIGIN_ID,
        FN_WARC_SEGMENT_NUMBER,
        FN_WARC_SEGMENT_TOTAL_LENGTH,
        FN_WARC_REFERS_TO_TARGET_URI,
        FN_WARC_REFERS_TO_DATE
    };

    /** Warc reader warc-type field name id. */
    public static final int FN_IDX_WARC_TYPE = 1;
    /** Warc reader warc-record-id field name id. */
    public static final int FN_IDX_WARC_RECORD_ID = 2;
    /** Warc reader warc-date field name id. */
    public static final int FN_IDX_WARC_DATE = 3;
    /** Warc reader content-length field name id. */
    public static final int FN_IDX_CONTENT_LENGTH = 4;
    /** Warc reader content-type field name id. */
    public static final int FN_IDX_CONTENT_TYPE = 5;
    /** Warc reader warc-concurrent-to field name id. */
    public static final int FN_IDX_WARC_CONCURRENT_TO = 6;
    /** Warc reader warc-block-digest field name id. */
    public static final int FN_IDX_WARC_BLOCK_DIGEST = 7;
    /** Warc reader warc-payload-digest field name id. */
    public static final int FN_IDX_WARC_PAYLOAD_DIGEST = 8;
    /** Warc reader warc-ip-address field name id. */
    public static final int FN_IDX_WARC_IP_ADDRESS = 9;
    /** Warc reader warc-refers-to field name id. */
    public static final int FN_IDX_WARC_REFERS_TO = 10;
    /** Warc reader warc-target-uri field name id. */
    public static final int FN_IDX_WARC_TARGET_URI = 11;
    /** Warc reader warc-truncated field name id. */
    public static final int FN_IDX_WARC_TRUNCATED = 12;
    /** Warc reader warc-warcinfo-id field name id. */
    public static final int FN_IDX_WARC_WARCINFO_ID = 13;
    /** Warc reader warc-filename field name id. */
    public static final int FN_IDX_WARC_FILENAME = 14;                    // warcinfo only
    /** Warc reader warc-profile field name id. */
    public static final int FN_IDX_WARC_PROFILE = 15;                    // revisit only
    /** Warc reader warc-identified-payload-type field name id. */
    public static final int FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE = 16;
    /** Warc reader warc-segment-origin-id field name id. */
    public static final int FN_IDX_WARC_SEGMENT_ORIGIN_ID = 17;            // continuation only
    /** Warc reader warc-segment-number field name id. */
    public static final int FN_IDX_WARC_SEGMENT_NUMBER = 18;
    /** Warc reader warc-segment-total-length field name id. */
    public static final int FN_IDX_WARC_SEGMENT_TOTAL_LENGTH = 19;        // continuation only
    /** WARC-Refers-To-Target-URI field name id. */
    public static final int FN_IDX_WARC_REFERS_TO_TARGET_URI = 20;
    /** WARC-Refers-To-Date field name id. */
    public static final int FN_IDX_WARC_REFERS_TO_DATE = 21;

    /**
     * Map used to identify known warc field names.
     * Keys are the lower-cased field names, values the field name ids.
     */
    public static final Map<String, Integer> fieldNameIdxMap = new HashMap<String, Integer>();

    /**
     * Populate map of known WARC field names.
     */
    static {
        // NOTE(review): toLowerCase() uses the default locale here; code
        // performing lookups presumably lower-cases the same way. Confirm
        // both sides before switching to a fixed locale.
        fieldNameIdxMap.put(FN_WARC_TYPE.toLowerCase(), FN_IDX_WARC_TYPE);
        fieldNameIdxMap.put(FN_WARC_RECORD_ID.toLowerCase(), FN_IDX_WARC_RECORD_ID);
        fieldNameIdxMap.put(FN_WARC_DATE.toLowerCase(), FN_IDX_WARC_DATE);
        fieldNameIdxMap.put(FN_CONTENT_LENGTH.toLowerCase(), FN_IDX_CONTENT_LENGTH);
        fieldNameIdxMap.put(FN_CONTENT_TYPE.toLowerCase(), FN_IDX_CONTENT_TYPE);
        fieldNameIdxMap.put(FN_WARC_CONCURRENT_TO.toLowerCase(), FN_IDX_WARC_CONCURRENT_TO);
        fieldNameIdxMap.put(FN_WARC_BLOCK_DIGEST.toLowerCase(), FN_IDX_WARC_BLOCK_DIGEST);
        fieldNameIdxMap.put(FN_WARC_PAYLOAD_DIGEST.toLowerCase(), FN_IDX_WARC_PAYLOAD_DIGEST);
        fieldNameIdxMap.put(FN_WARC_IP_ADDRESS.toLowerCase(), FN_IDX_WARC_IP_ADDRESS);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO.toLowerCase(), FN_IDX_WARC_REFERS_TO);
        fieldNameIdxMap.put(FN_WARC_TARGET_URI.toLowerCase(), FN_IDX_WARC_TARGET_URI);
        fieldNameIdxMap.put(FN_WARC_TRUNCATED.toLowerCase(), FN_IDX_WARC_TRUNCATED);
        fieldNameIdxMap.put(FN_WARC_WARCINFO_ID.toLowerCase(), FN_IDX_WARC_WARCINFO_ID);
        fieldNameIdxMap.put(FN_WARC_FILENAME.toLowerCase(), FN_IDX_WARC_FILENAME);
        fieldNameIdxMap.put(FN_WARC_PROFILE.toLowerCase(), FN_IDX_WARC_PROFILE);
        fieldNameIdxMap.put(FN_WARC_IDENTIFIED_PAYLOAD_TYPE.toLowerCase(), FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_ORIGIN_ID.toLowerCase(), FN_IDX_WARC_SEGMENT_ORIGIN_ID);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_NUMBER.toLowerCase(), FN_IDX_WARC_SEGMENT_NUMBER);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_TOTAL_LENGTH.toLowerCase(), FN_IDX_WARC_SEGMENT_TOTAL_LENGTH);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO_TARGET_URI.toLowerCase(), FN_IDX_WARC_REFERS_TO_TARGET_URI);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO_DATE.toLowerCase(), FN_IDX_WARC_REFERS_TO_DATE);
    }

    /** WARC String field datatype identifier. */
    public static final int FDT_STRING = 0;
    /** WARC Integer field datatype identifier. */
    public static final int FDT_INTEGER = 1;
    /** WARC Long field datatype identifier. */
    public static final int FDT_LONG = 2;
    /** WARC Digest field datatype identifier. */
    public static final int FDT_DIGEST = 3;
    /** WARC ContentType field datatype identifier. */
    public static final int FDT_CONTENTTYPE = 4;
    /** WARC Date field datatype identifier. */
    public static final int FDT_DATE = 5;
    /** WARC InetAddress field datatype identifier. */
    public static final int FDT_INETADDRESS = 6;
    /** WARC URI field datatype identifier. */
    public static final int FDT_URI = 7;

    /** WARC field datatype id to field datatype name mapping table. */
    public static final String[] FDT_IDX_STRINGS = {
        "String",
        "Integer",
        "Long",
        "Digest",
        "ContentType",
        "Date",
        "InetAddress",
        "URI"
    };

    /**
     * Array to lookup WARC field datatypes.
     * Indexed by field name id, in the same order as
     * {@link #FN_IDX_STRINGS}; the unused slot 0 holds -1.
     */
    public static final int[] FN_IDX_DT = {
        -1,
        FDT_STRING,
        FDT_URI,
        FDT_DATE,
        FDT_LONG,
        FDT_CONTENTTYPE,
        FDT_URI,
        FDT_DIGEST,
        FDT_DIGEST,
        FDT_INETADDRESS,
        FDT_URI,
        FDT_URI,
        FDT_STRING,
        FDT_URI,
        FDT_STRING,
        FDT_URI,
        FDT_CONTENTTYPE,
        FDT_URI,
        FDT_INTEGER,
        FDT_LONG,
        FDT_URI,
        FDT_DATE
    };

    /*
     * WARC fields that can have multiple occurrences in a Warc header.
     */

    /**
     * Lookup table of Warc fields that can have multiple occurrences.
     * Indexed by field name id.
     */
    public static final boolean[] fieldNamesRepeatableLookup = new boolean[FN_INDEX_OF_LAST];

    /**
     * Populate multiple occurrences lookup table.
     */
    static {
        fieldNamesRepeatableLookup[FN_IDX_WARC_CONCURRENT_TO] = true;
    }

    /*
     * WARC record types.
     */

    /** WARC-Type warcinfo id. */
    public static final String RT_WARCINFO = "warcinfo";
    /** WARC-Type response id. */
    public static final String RT_RESPONSE = "response";
    /** WARC-Type resource id. */
    public static final String RT_RESOURCE = "resource";
    /** WARC-Type request id. */
    public static final String RT_REQUEST = "request";
    /** WARC-Type metadata id. */
    public static final String RT_METADATA = "metadata";
    /** WARC-Type revisit id. */
    public static final String RT_REVISIT = "revisit";
    /** WARC-Type conversion id. */
    public static final String RT_CONVERSION = "conversion";
    /** WARC-Type continuation id. */
    public static final String RT_CONTINUATION = "continuation";

    /** WARC type id to field name mapping table.
     *  Zero indexed array with all indexes &gt;= 1 used. (Index 0 is unused) */
    public static final String[] RT_IDX_STRINGS = {
        null,
        RT_WARCINFO,
        RT_RESPONSE,
        RT_RESOURCE,
        RT_REQUEST,
        RT_METADATA,
        RT_REVISIT,
        RT_CONVERSION,
        RT_CONTINUATION
    };

    /** Warc reader unknown warc record type id. */
    public static final int RT_IDX_UNKNOWN = 0;
    /** Warc reader warcinfo warc record type id. */
    public static final int RT_IDX_WARCINFO = 1;
    /** Warc reader response warc record type id. */
    public static final int RT_IDX_RESPONSE = 2;
    /** Warc reader resource warc record type id. */
    public static final int RT_IDX_RESOURCE = 3;
    /** Warc reader request warc record type id. */
    public static final int RT_IDX_REQUEST = 4;
    /** Warc reader metadata warc record type id. */
    public static final int RT_IDX_METADATA = 5;
    /** Warc reader revisit warc record type id. */
    public static final int RT_IDX_REVISIT = 6;
    /** Warc reader conversion warc record type id. */
    public static final int RT_IDX_CONVERSION = 7;
    /** Warc reader continuation warc record type id. */
    public static final int RT_IDX_CONTINUATION = 8;

    /**
     * WARC-Type lookup map.
     * Keys are the lower-cased type names, values the record type ids.
     */
    public static final Map<String, Integer> recordTypeIdxMap = new HashMap<String, Integer>();

    /**
     * Populate WARC-Type lookup map.
     */
    static {
        recordTypeIdxMap.put(RT_WARCINFO.toLowerCase(), RT_IDX_WARCINFO);
        recordTypeIdxMap.put(RT_RESPONSE.toLowerCase(), RT_IDX_RESPONSE);
        recordTypeIdxMap.put(RT_RESOURCE.toLowerCase(), RT_IDX_RESOURCE);
        recordTypeIdxMap.put(RT_REQUEST.toLowerCase(), RT_IDX_REQUEST);
        recordTypeIdxMap.put(RT_METADATA.toLowerCase(), RT_IDX_METADATA);
        recordTypeIdxMap.put(RT_REVISIT.toLowerCase(), RT_IDX_REVISIT);
        recordTypeIdxMap.put(RT_CONVERSION.toLowerCase(), RT_IDX_CONVERSION);
        recordTypeIdxMap.put(RT_CONTINUATION.toLowerCase(), RT_IDX_CONTINUATION);
    }

    /*
     * Truncation reason types.
     */

    /** WARC-Truncated length id. */
    public static final String TT_LENGTH = "length";
    /** WARC-Truncated time id. */
    public static final String TT_TIME = "time";
    /** WARC-Truncated disconnect id. */
    public static final String TT_DISCONNECT = "disconnect";
    /** WARC-Truncated unspecified id. */
    public static final String TT_UNSPECIFIED = "unspecified";

    /** WARC truncation reason id to field name mapping table.
     *  Zero indexed array with all indexes &gt;= 1 used. (Index 0 is unused) */
    public static final String[] TT_IDX_STRINGS = {
        null,
        TT_LENGTH,
        TT_TIME,
        TT_DISCONNECT,
        TT_UNSPECIFIED
    };

    /** Warc reader future reason id. */
    public static final int TT_IDX_FUTURE_REASON = 0;
    /** Warc reader length reason id. */
    public static final int TT_IDX_LENGTH = 1;
    /** Warc reader time reason id. */
    public static final int TT_IDX_TIME = 2;
    /** Warc reader disconnect reason id. */
    public static final int TT_IDX_DISCONNECT = 3;
    /** Warc reader unspecified reason id. */
    public static final int TT_IDX_UNSPECIFIED = 4;

    /**
     * Lookup map for known truncation reason id's.
     * Keys are the lower-cased reason names, values the reason ids.
     */
    public static final Map<String, Integer> truncatedTypeIdxMap = new HashMap<String, Integer>();

    /**
     * Populate truncation reason id lookup map.
     */
    static {
        truncatedTypeIdxMap.put(TT_LENGTH.toLowerCase(), TT_IDX_LENGTH);
        truncatedTypeIdxMap.put(TT_TIME.toLowerCase(), TT_IDX_TIME);
        truncatedTypeIdxMap.put(TT_DISCONNECT.toLowerCase(), TT_IDX_DISCONNECT);
        truncatedTypeIdxMap.put(TT_UNSPECIFIED.toLowerCase(), TT_IDX_UNSPECIFIED);
    }

    /*
     * Warc revisit profile ids used in the WARC-Profile header (See ISO).
     */

    /** Revisit WARC-Profile id for identical payload digest. */
    public static final String PROFILE_IDENTICAL_PAYLOAD_DIGEST =
            "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";

    /** Revisit WARC-Profile id for server not modified. */
    public static final String PROFILE_SERVER_NOT_MODIFIED =
            "http://netpreserve.org/warc/1.0/revisit/server-not-modified";

    /** WARC profile id to field name mapping table.
     *  Zero indexed array with all indexes &gt;= 1 used. (Index 0 is unused) */
    public static final String[] P_IDX_STRINGS = {
        null,
        PROFILE_IDENTICAL_PAYLOAD_DIGEST,
        PROFILE_SERVER_NOT_MODIFIED
    };

    /*
     * Warc revisit profile ids returned by the warc reader.
     * The raw value is also available in case of unknown profiles.
     */

    /** Warc reader id for unknown profile. */
    public static final int PROFILE_IDX_UNKNOWN = 0;
    /** Warc reader id for identical payload digest profile. */
    public static final int PROFILE_IDX_IDENTICAL_PAYLOAD_DIGEST = 1;
    /** Warc reader id for server not modified profile. */
    public static final int PROFILE_IDX_SERVER_NOT_MODIFIED = 2;

    /**
     * Profile lookup map used to identify WARC-Profile values.
     * Keys are the lower-cased profile URIs, values the profile ids.
     */
    public static final Map<String, Integer> profileIdxMap = new HashMap<String, Integer>();

    /**
     * Populate the lookup map with known WARC-Profile ids.
     */
    static {
        profileIdxMap.put(PROFILE_IDENTICAL_PAYLOAD_DIGEST.toLowerCase(),
                PROFILE_IDX_IDENTICAL_PAYLOAD_DIGEST);
        profileIdxMap.put(PROFILE_SERVER_NOT_MODIFIED.toLowerCase(),
                PROFILE_IDX_SERVER_NOT_MODIFIED);
    }

    /*
     * The different requirement levels as per RFC 2119.
     * (See http://www.ietf.org/rfc/rfc2119.txt)
     */

    /** Warc header can be ignored. */
    public static final int POLICY_IGNORE = 0;
    /** Warc header is mandatory (equal to shall). */
    public static final int POLICY_MANDATORY = 1;
    /** Warc header must be present. */
    public static final int POLICY_SHALL = 2;
    /** Warc header must not be present. */
    public static final int POLICY_SHALL_NOT = 3;
    /** Warc header can be present. */
    public static final int POLICY_MAY = 4;
    /** Warc header should not be present. */
    public static final int POLICY_MAY_NOT = 5;

    /** A (Warc-Types x Warc-Header-Fields) matrix used for policy validation.
     *  First index is the record type id, second index the field name id.
     *  Entries never assigned below keep the array default of
     *  {@link #POLICY_IGNORE}. */
    public static final int[][] field_policy;

    /**
     * The following section initializes the policy matrix used to check the
     * usage of each known warc header line against each known warc record
     * type.
     * The ISO standard was used to build the data in the matrix.
     */
    static {
        field_policy = new int[RT_INDEX_OF_LAST][FN_INDEX_OF_LAST];

        // Warc-Record-id
        // Warc-Type
        // Warc-Date
        // Content-Length
        // Also required for unknown warc-types (hence the loop starts at 0).
        for (int i=0; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_RECORD_ID] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_WARC_TYPE] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_WARC_DATE] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_CONTENT_LENGTH] = POLICY_MANDATORY;
        }

        // Content-Type
        field_policy[RT_IDX_CONTINUATION][FN_IDX_CONTENT_TYPE] = POLICY_SHALL_NOT;

        // Warc-Ip-Address
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;

        // Warc-Concurrent-To
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;

        // Warc-Refers-To
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;

        // Warc-Target-Uri
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_TARGET_URI] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL_NOT;

        // Warc-Warcinfo-Id
        // Warc-Filename
        // Warc-Profile
        // Warc-Segment-Origin-Id
        // Warc-Segment-Total-Length
        // Defaults for every known record type; the per-type exceptions
        // are applied right after the loop.
        for (int i=1; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_WARCINFO_ID] = POLICY_MAY;
            field_policy[i][FN_IDX_WARC_FILENAME] = POLICY_SHALL_NOT;
            field_policy[i][FN_IDX_WARC_PROFILE] = POLICY_IGNORE;
            field_policy[i][FN_IDX_WARC_SEGMENT_ORIGIN_ID] = POLICY_SHALL_NOT;
            // Previously this line repeated the Segment-Origin-Id assignment,
            // leaving Segment-Total-Length without any policy.
            field_policy[i][FN_IDX_WARC_SEGMENT_TOTAL_LENGTH] = POLICY_SHALL_NOT;
        }
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_WARCINFO_ID] = POLICY_MAY_NOT;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_FILENAME] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_PROFILE] = POLICY_MANDATORY;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_ORIGIN_ID] = POLICY_MANDATORY;
        // Only the final segment carries the total length, so it cannot be
        // mandatory for every continuation record.
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_TOTAL_LENGTH] = POLICY_MAY;

        // Warc-Segment-Number
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_NUMBER] = POLICY_MANDATORY;

        // WARC-Refers-To-Target-URI
        // WARC-Refers-To-Date
        for (int i=1; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_REFERS_TO_TARGET_URI] = POLICY_SHALL_NOT;
            field_policy[i][FN_IDX_WARC_REFERS_TO_DATE] = POLICY_SHALL_NOT;
        }
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO_TARGET_URI] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO_DATE] = POLICY_MAY;
    }

}