/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.util.HashMap;
import java.util.Map;

/**
 * Class containing all relevant WARC constants and structures.
 * Including but not limited to field names and mime-types.
 * Also includes statically initialized structures for validation.
 *
 * <p>All lookup tables that are indexed by a field id, record type id,
 * truncation reason id or profile id reserve index 0; real entries start
 * at index 1.</p>
 *
 * @author nicl
 */
public class WarcConstants {

    /**
     * This utility class does not require instantiation.
     */
    protected WarcConstants() {
    }

    /**
     * A WARC header block starts with this string including trailing version
     * information.
     */
    public static final String WARC_MAGIC_HEADER = "WARC/";

    /**
     * End mark used after each record consisting of two newlines.
     * Spelled out as bytes (CR LF CR LF) so the value does not depend on the
     * platform default charset.
     */
    protected static byte[] endMark = {'\r', '\n', '\r', '\n'};

    /** WARC date format string as specified by the WARC ISO standard. */
    public static final String WARC_DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";

    /** WARC digest format string as specified by the WARC ISO standard. */
    public static final String WARC_DIGEST_FORMAT = "<digest-algorithm>:<digest-encoded>";

    /** Content-type format string as specified in RFC2616. */
    public static final String CONTENT_TYPE_FORMAT = "<type>/<sub-type>(; <argument>=<value>)*";

    /*
     * WARC content-types (MIME).
     */

    /** WARC mime type. */
    public static final String WARC_MIME_TYPE = "application/warc";

    /** Suggested content-type/media-type for metadata records and others. */
    public static final String CT_APP_WARC_FIELDS = "application/warc-fields";

    /** Suggested content-type for metadata records and others. */
    public static final String CONTENT_TYPE_METADATA = "application";

    /** Suggested media-type for metadata records and others. */
    public static final String MEDIA_TYPE_METADATA = "warc-fields";

    // Other content-types, currently without a constant:
    //"text/dns"
    //"application/http;msgtype=request"
    //"application/http;msgtype=response"

    /*
     * Sizing constants for the lookup tables below.
     */

    /** Trailing newlines after each record as per the WARC ISO standard. */
    public static final int WARC_RECORD_TRAILING_NEWLINES = 2;

    /** Number of WARC fields. */
    public static final int FN_NUMBER = 21;

    /**
     * Length of arrays indexed by WARC field id, i.e. {@code FN_NUMBER + 1}.
     * Despite the name this is one past the highest field id
     * ({@code FN_NUMBER}); the extra slot accounts for the unused index 0.
     */
    public static final int FN_INDEX_OF_LAST = FN_NUMBER+1;

    /** Number of WARC types. */
    public static final int RT_NUMBER = 8;

    /**
     * Length of arrays indexed by WARC record type id, i.e.
     * {@code RT_NUMBER + 1}. Despite the name this is one past the highest
     * record type id ({@code RT_NUMBER}); index 0 is the unknown type.
     */
    public static final int RT_INDEX_OF_LAST = RT_NUMBER+1;

    /*
     * WARC field names.
     */

    /** Warc-type field name. */
    public static final String FN_WARC_TYPE = "WARC-Type";
    /** Warc-record-id field name. */
    public static final String FN_WARC_RECORD_ID = "WARC-Record-ID";
    /** Warc-date field name. */
    public static final String FN_WARC_DATE = "WARC-Date";
    /** Content-length field name. */
    public static final String FN_CONTENT_LENGTH = "Content-Length";
    /** Content-type field name. */
    public static final String FN_CONTENT_TYPE = "Content-Type";
    /** Warc-concurrent-to field name. */
    public static final String FN_WARC_CONCURRENT_TO = "WARC-Concurrent-To";
    /** Warc-block-digest field name. */
    public static final String FN_WARC_BLOCK_DIGEST = "WARC-Block-Digest";
    /** Warc-payload-digest field name. */
    public static final String FN_WARC_PAYLOAD_DIGEST = "WARC-Payload-Digest";
    /** Warc-ip-address field name. */
    public static final String FN_WARC_IP_ADDRESS = "WARC-IP-Address";
    /** Warc-refers-to field name. */
    public static final String FN_WARC_REFERS_TO = "WARC-Refers-To";
    /** Warc-target-uri field name. */
    public static final String FN_WARC_TARGET_URI = "WARC-Target-URI";
    /** Warc-truncated field name. */
    public static final String FN_WARC_TRUNCATED = "WARC-Truncated";
    /** Warc-warcinfo-id field name. */
    public static final String FN_WARC_WARCINFO_ID = "WARC-Warcinfo-ID";
    /** Warc-filename field name. */
    public static final String FN_WARC_FILENAME = "WARC-Filename";
    /** Warc-profile field name. */
    public static final String FN_WARC_PROFILE = "WARC-Profile";
    /** Warc-identified-payload-type field name. */
    public static final String FN_WARC_IDENTIFIED_PAYLOAD_TYPE = "WARC-Identified-Payload-Type";
    /** Warc-segment-origin-id field name. */
    public static final String FN_WARC_SEGMENT_ORIGIN_ID = "WARC-Segment-Origin-ID";
    /** Warc-segment-number field name. */
    public static final String FN_WARC_SEGMENT_NUMBER = "WARC-Segment-Number";
    /** Warc-segment-total-length field name. */
    public static final String FN_WARC_SEGMENT_TOTAL_LENGTH = "WARC-Segment-Total-Length";
    /** WARC-Refers-To-Target-URI field name. */
    public static final String FN_WARC_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI";
    /** WARC-Refers-To-Date field name. */
    public static final String FN_WARC_REFERS_TO_DATE = "WARC-Refers-To-Date";

    /**
     * WARC field name id to field name mapping table.
     * Every index >= 1 is used; index 0 is unused and holds {@code null}.
     */
    public static final String[] FN_IDX_STRINGS = {
        null,
        FN_WARC_TYPE,
        FN_WARC_RECORD_ID,
        FN_WARC_DATE,
        FN_CONTENT_LENGTH,
        FN_CONTENT_TYPE,
        FN_WARC_CONCURRENT_TO,
        FN_WARC_BLOCK_DIGEST,
        FN_WARC_PAYLOAD_DIGEST,
        FN_WARC_IP_ADDRESS,
        FN_WARC_REFERS_TO,
        FN_WARC_TARGET_URI,
        FN_WARC_TRUNCATED,
        FN_WARC_WARCINFO_ID,
        FN_WARC_FILENAME,
        FN_WARC_PROFILE,
        FN_WARC_IDENTIFIED_PAYLOAD_TYPE,
        FN_WARC_SEGMENT_ORIGIN_ID,
        FN_WARC_SEGMENT_NUMBER,
        FN_WARC_SEGMENT_TOTAL_LENGTH,
        FN_WARC_REFERS_TO_TARGET_URI,
        FN_WARC_REFERS_TO_DATE
    };

    /** Warc reader warc-type field name id. */
    public static final int FN_IDX_WARC_TYPE = 1;
    /** Warc reader warc-record-id field name id. */
    public static final int FN_IDX_WARC_RECORD_ID = 2;
    /** Warc reader warc-date field name id. */
    public static final int FN_IDX_WARC_DATE = 3;
    /** Warc reader content-length field name id. */
    public static final int FN_IDX_CONTENT_LENGTH = 4;
    /** Warc reader content-type field name id. */
    public static final int FN_IDX_CONTENT_TYPE = 5;
    /** Warc reader warc-concurrent-to field name id. */
    public static final int FN_IDX_WARC_CONCURRENT_TO = 6;
    /** Warc reader warc-block-digest field name id. */
    public static final int FN_IDX_WARC_BLOCK_DIGEST = 7;
    /** Warc reader warc-payload-digest field name id. */
    public static final int FN_IDX_WARC_PAYLOAD_DIGEST = 8;
    /** Warc reader warc-ip-address field name id. */
    public static final int FN_IDX_WARC_IP_ADDRESS = 9;
    /** Warc reader warc-refers-to field name id. */
    public static final int FN_IDX_WARC_REFERS_TO = 10;
    /** Warc reader warc-target-uri field name id. */
    public static final int FN_IDX_WARC_TARGET_URI = 11;
    /** Warc reader warc-truncated field name id. */
    public static final int FN_IDX_WARC_TRUNCATED = 12;
    /** Warc reader warc-warcinfo-id field name id. */
    public static final int FN_IDX_WARC_WARCINFO_ID = 13;
    /** Warc reader warc-filename field name id. */
    public static final int FN_IDX_WARC_FILENAME = 14;    // warcinfo only
    /** Warc reader warc-profile field name id. */
    public static final int FN_IDX_WARC_PROFILE = 15;    // revisit only
    /** Warc reader warc-identified-payload-type field name id. */
    public static final int FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE = 16;
    /** Warc reader warc-segment-origin-id field name id. */
    public static final int FN_IDX_WARC_SEGMENT_ORIGIN_ID = 17;    // continuation only
    /** Warc reader warc-segment-number field name id. */
    public static final int FN_IDX_WARC_SEGMENT_NUMBER = 18;
    /** Warc reader warc-segment-total-length field name id. */
    public static final int FN_IDX_WARC_SEGMENT_TOTAL_LENGTH = 19;    // continuation only
    /** WARC-Refers-To-Target-URI field name id. */
    public static final int FN_IDX_WARC_REFERS_TO_TARGET_URI = 20;
    /** WARC-Refers-To-Date field name id. */
    public static final int FN_IDX_WARC_REFERS_TO_DATE = 21;

    /**
     * Map used to identify known warc field names.
     * Keys are the lower-cased field names, values the field name ids.
     */
    public static final Map<String, Integer> fieldNameIdxMap = new HashMap<String, Integer>();

    /*
     * Populate map of known WARC field names.
     * NOTE(review): keys are lower-cased with the default locale; callers
     * presumably normalise their lookups the same way - verify before
     * switching to a fixed locale.
     */
    static {
        fieldNameIdxMap.put(FN_WARC_TYPE.toLowerCase(), FN_IDX_WARC_TYPE);
        fieldNameIdxMap.put(FN_WARC_RECORD_ID.toLowerCase(), FN_IDX_WARC_RECORD_ID);
        fieldNameIdxMap.put(FN_WARC_DATE.toLowerCase(), FN_IDX_WARC_DATE);
        fieldNameIdxMap.put(FN_CONTENT_LENGTH.toLowerCase(), FN_IDX_CONTENT_LENGTH);
        fieldNameIdxMap.put(FN_CONTENT_TYPE.toLowerCase(), FN_IDX_CONTENT_TYPE);
        fieldNameIdxMap.put(FN_WARC_CONCURRENT_TO.toLowerCase(), FN_IDX_WARC_CONCURRENT_TO);
        fieldNameIdxMap.put(FN_WARC_BLOCK_DIGEST.toLowerCase(), FN_IDX_WARC_BLOCK_DIGEST);
        fieldNameIdxMap.put(FN_WARC_PAYLOAD_DIGEST.toLowerCase(), FN_IDX_WARC_PAYLOAD_DIGEST);
        fieldNameIdxMap.put(FN_WARC_IP_ADDRESS.toLowerCase(), FN_IDX_WARC_IP_ADDRESS);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO.toLowerCase(), FN_IDX_WARC_REFERS_TO);
        fieldNameIdxMap.put(FN_WARC_TARGET_URI.toLowerCase(), FN_IDX_WARC_TARGET_URI);
        fieldNameIdxMap.put(FN_WARC_TRUNCATED.toLowerCase(), FN_IDX_WARC_TRUNCATED);
        fieldNameIdxMap.put(FN_WARC_WARCINFO_ID.toLowerCase(), FN_IDX_WARC_WARCINFO_ID);
        fieldNameIdxMap.put(FN_WARC_FILENAME.toLowerCase(), FN_IDX_WARC_FILENAME);
        fieldNameIdxMap.put(FN_WARC_PROFILE.toLowerCase(), FN_IDX_WARC_PROFILE);
        fieldNameIdxMap.put(FN_WARC_IDENTIFIED_PAYLOAD_TYPE.toLowerCase(), FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_ORIGIN_ID.toLowerCase(), FN_IDX_WARC_SEGMENT_ORIGIN_ID);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_NUMBER.toLowerCase(), FN_IDX_WARC_SEGMENT_NUMBER);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_TOTAL_LENGTH.toLowerCase(), FN_IDX_WARC_SEGMENT_TOTAL_LENGTH);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO_TARGET_URI.toLowerCase(), FN_IDX_WARC_REFERS_TO_TARGET_URI);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO_DATE.toLowerCase(), FN_IDX_WARC_REFERS_TO_DATE);
    }

    /** WARC String field datatype identifier. */
    public static final int FDT_STRING = 0;
    /** WARC Integer field datatype identifier. */
    public static final int FDT_INTEGER = 1;
    /** WARC Long field datatype identifier. */
    public static final int FDT_LONG = 2;
    /** WARC Digest field datatype identifier. */
    public static final int FDT_DIGEST = 3;
    /** WARC ContentType field datatype identifier. */
    public static final int FDT_CONTENTTYPE = 4;
    /** WARC Date field datatype identifier. */
    public static final int FDT_DATE = 5;
    /** WARC InetAddress field datatype identifier. */
    public static final int FDT_INETADDRESS = 6;
    /** WARC URI field datatype identifier. */
    public static final int FDT_URI = 7;

    /** WARC field datatype id to field datatype name mapping table. */
    public static final String[] FDT_IDX_STRINGS = {
        "String",
        "Integer",
        "Long",
        "Digest",
        "ContentType",
        "Date",
        "InetAddress",
        "URI"
    };

    /**
     * Array to lookup WARC field datatypes.
     * Indexed by field name id; entries are in the same order as
     * {@link #FN_IDX_STRINGS}. The unused index 0 holds -1.
     */
    public static final int[] FN_IDX_DT = {
        -1,
        FDT_STRING,         // WARC-Type
        FDT_URI,            // WARC-Record-ID
        FDT_DATE,           // WARC-Date
        FDT_LONG,           // Content-Length
        FDT_CONTENTTYPE,    // Content-Type
        FDT_URI,            // WARC-Concurrent-To
        FDT_DIGEST,         // WARC-Block-Digest
        FDT_DIGEST,         // WARC-Payload-Digest
        FDT_INETADDRESS,    // WARC-IP-Address
        FDT_URI,            // WARC-Refers-To
        FDT_URI,            // WARC-Target-URI
        FDT_STRING,         // WARC-Truncated
        FDT_URI,            // WARC-Warcinfo-ID
        FDT_STRING,         // WARC-Filename
        FDT_URI,            // WARC-Profile
        FDT_CONTENTTYPE,    // WARC-Identified-Payload-Type
        FDT_URI,            // WARC-Segment-Origin-ID
        FDT_INTEGER,        // WARC-Segment-Number
        FDT_LONG,           // WARC-Segment-Total-Length
        FDT_URI,            // WARC-Refers-To-Target-URI
        FDT_DATE            // WARC-Refers-To-Date
    };

    /*
     * WARC fields that can have multiple occurrences in a Warc header.
     */

    /**
     * Lookup table of Warc fields that can have multiple occurrences.
     * Indexed by field name id.
     */
    public static final boolean[] fieldNamesRepeatableLookup = new boolean[FN_INDEX_OF_LAST];

    /*
     * Populate multiple occurrences lookup table.
     */
    static {
        fieldNamesRepeatableLookup[FN_IDX_WARC_CONCURRENT_TO] = true;
    }

    /*
     * WARC record types.
     */

    /** WARC-Type warcinfo id. */
    public static final String RT_WARCINFO = "warcinfo";
    /** WARC-Type response id. */
    public static final String RT_RESPONSE = "response";
    /** WARC-Type resource id. */
    public static final String RT_RESOURCE = "resource";
    /** WARC-Type request id. */
    public static final String RT_REQUEST = "request";
    /** WARC-Type metadata id. */
    public static final String RT_METADATA = "metadata";
    /** WARC-Type revisit id. */
    public static final String RT_REVISIT = "revisit";
    /** WARC-Type conversion id. */
    public static final String RT_CONVERSION = "conversion";
    /** WARC-Type continuation id. */
    public static final String RT_CONTINUATION = "continuation";

    /**
     * WARC type id to type name mapping table.
     * Every index >= 1 is used; index 0 is unused and holds {@code null}.
     */
    public static final String[] RT_IDX_STRINGS = {
        null,
        RT_WARCINFO,
        RT_RESPONSE,
        RT_RESOURCE,
        RT_REQUEST,
        RT_METADATA,
        RT_REVISIT,
        RT_CONVERSION,
        RT_CONTINUATION
    };

    /** Warc reader unknown warc record type id. */
    public static final int RT_IDX_UNKNOWN = 0;
    /** Warc reader warcinfo warc record type id. */
    public static final int RT_IDX_WARCINFO = 1;
    /** Warc reader response warc record type id. */
    public static final int RT_IDX_RESPONSE = 2;
    /** Warc reader resource warc record type id. */
    public static final int RT_IDX_RESOURCE = 3;
    /** Warc reader request warc record type id. */
    public static final int RT_IDX_REQUEST = 4;
    /** Warc reader metadata warc record type id. */
    public static final int RT_IDX_METADATA = 5;
    /** Warc reader revisit warc record type id. */
    public static final int RT_IDX_REVISIT = 6;
    /** Warc reader conversion warc record type id. */
    public static final int RT_IDX_CONVERSION = 7;
    /** Warc reader continuation warc record type id. */
    public static final int RT_IDX_CONTINUATION = 8;

    /**
     * WARC-Type lookup map.
     * Keys are the lower-cased type names, values the record type ids.
     */
    public static final Map<String, Integer> recordTypeIdxMap = new HashMap<String, Integer>();

    /*
     * Populate WARC-Type lookup map.
     */
    static {
        recordTypeIdxMap.put(RT_WARCINFO.toLowerCase(), RT_IDX_WARCINFO);
        recordTypeIdxMap.put(RT_RESPONSE.toLowerCase(), RT_IDX_RESPONSE);
        recordTypeIdxMap.put(RT_RESOURCE.toLowerCase(), RT_IDX_RESOURCE);
        recordTypeIdxMap.put(RT_REQUEST.toLowerCase(), RT_IDX_REQUEST);
        recordTypeIdxMap.put(RT_METADATA.toLowerCase(), RT_IDX_METADATA);
        recordTypeIdxMap.put(RT_REVISIT.toLowerCase(), RT_IDX_REVISIT);
        recordTypeIdxMap.put(RT_CONVERSION.toLowerCase(), RT_IDX_CONVERSION);
        recordTypeIdxMap.put(RT_CONTINUATION.toLowerCase(), RT_IDX_CONTINUATION);
    }

    /*
     * Truncation reason types.
     */

    /** WARC-Truncated length id. */
    public static final String TT_LENGTH = "length";
    /** WARC-Truncated time id. */
    public static final String TT_TIME = "time";
    /** WARC-Truncated disconnect id. */
    public static final String TT_DISCONNECT = "disconnect";
    /** WARC-Truncated unspecified id. */
    public static final String TT_UNSPECIFIED = "unspecified";

    /**
     * WARC truncation reason id to reason name mapping table.
     * Every index >= 1 is used; index 0 is unused and holds {@code null}.
     */
    public static final String[] TT_IDX_STRINGS = {
        null,
        TT_LENGTH,
        TT_TIME,
        TT_DISCONNECT,
        TT_UNSPECIFIED
    };

    /** Warc reader future reason id. */
    public static final int TT_IDX_FUTURE_REASON = 0;
    /** Warc reader length reason id. */
    public static final int TT_IDX_LENGTH = 1;
    /** Warc reader time reason id. */
    public static final int TT_IDX_TIME = 2;
    /** Warc reader disconnect reason id. */
    public static final int TT_IDX_DISCONNECT = 3;
    /** Warc reader unspecified reason id. */
    public static final int TT_IDX_UNSPECIFIED = 4;

    /**
     * Lookup map for known truncation reason id's.
     * Keys are the lower-cased reason names, values the reason ids.
     */
    public static final Map<String, Integer> truncatedTypeIdxMap = new HashMap<String, Integer>();

    /*
     * Populate truncation reason id lookup map.
     */
    static {
        truncatedTypeIdxMap.put(TT_LENGTH.toLowerCase(), TT_IDX_LENGTH);
        truncatedTypeIdxMap.put(TT_TIME.toLowerCase(), TT_IDX_TIME);
        truncatedTypeIdxMap.put(TT_DISCONNECT.toLowerCase(), TT_IDX_DISCONNECT);
        truncatedTypeIdxMap.put(TT_UNSPECIFIED.toLowerCase(), TT_IDX_UNSPECIFIED);
    }

    /*
     * Warc revisit profile ids used in the WARC-Profile header (See ISO).
     */

    /** Revisit WARC-Profile id for identical payload digest. */
    public static final String PROFILE_IDENTICAL_PAYLOAD_DIGEST =
            "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";

    /** Revisit WARC-Profile id for server not modified. */
    public static final String PROFILE_SERVER_NOT_MODIFIED =
            "http://netpreserve.org/warc/1.0/revisit/server-not-modified";

    /**
     * WARC profile id to profile name mapping table.
     * Every index >= 1 is used; index 0 is unused and holds {@code null}.
     */
    public static final String[] P_IDX_STRINGS = {
        null,
        PROFILE_IDENTICAL_PAYLOAD_DIGEST,
        PROFILE_SERVER_NOT_MODIFIED
    };

    /*
     * Warc revisit profile ids returned by the warc reader.
     * The raw value is also available in case of unknown profiles.
     */

    /** Warc reader id for unknown profile. */
    public static final int PROFILE_IDX_UNKNOWN = 0;
    /** Warc reader id for identical payload digest profile. */
    public static final int PROFILE_IDX_IDENTICAL_PAYLOAD_DIGEST = 1;
    /** Warc reader id for server not modified profile. */
    public static final int PROFILE_IDX_SERVER_NOT_MODIFIED = 2;

    /**
     * Profile lookup map used to identify WARC-Profile values.
     * Keys are the lower-cased profile URIs, values the profile ids.
     */
    public static final Map<String, Integer> profileIdxMap = new HashMap<String, Integer>();

    /*
     * Populate the lookup map with known WARC-Profile ids.
     */
    static {
        profileIdxMap.put(PROFILE_IDENTICAL_PAYLOAD_DIGEST.toLowerCase(),
                PROFILE_IDX_IDENTICAL_PAYLOAD_DIGEST);
        profileIdxMap.put(PROFILE_SERVER_NOT_MODIFIED.toLowerCase(),
                PROFILE_IDX_SERVER_NOT_MODIFIED);
    }

    /*
     * The different requirement levels as per RFC 2119.
     * (See http://www.ietf.org/rfc/rfc2119.txt)
     */

    /** Warc header can be ignored. */
    public static final int POLICY_IGNORE = 0;
    /** Warc header is mandatory (equal to shall). */
    public static final int POLICY_MANDATORY = 1;
    /** Warc header must be present. */
    public static final int POLICY_SHALL = 2;
    /** Warc header must not be present. */
    public static final int POLICY_SHALL_NOT = 3;
    /** Warc header can be present. */
    public static final int POLICY_MAY = 4;
    /** Warc header should not be present. */
    public static final int POLICY_MAY_NOT = 5;

    /**
     * A (Warc-Types x Warc-Header-Fields) matrix used for policy validation.
     * First index is a record type id, second index a field name id; each
     * cell holds one of the {@code POLICY_*} values. Cells that are never
     * assigned keep the array default 0, i.e. {@link #POLICY_IGNORE}.
     */
    public static final int[][] field_policy;

    /*
     * The following section initializes the policy matrix used to check the
     * usage of each known warc header line against each known warc record
     * type.
     * The ISO standard was used to build the data in the matrix.
     * Each group first assigns a default for a range of record types and
     * then overrides individual record types, so statement order matters.
     */
    static {
        field_policy = new int[RT_INDEX_OF_LAST][FN_INDEX_OF_LAST];

        // Warc-Record-id
        // Warc-Type
        // Warc-Date
        // Content-Length
        // Also required for unknown warc-types (hence the loop starts at 0).
        for (int i=0; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_RECORD_ID] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_WARC_TYPE] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_WARC_DATE] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_CONTENT_LENGTH] = POLICY_MANDATORY;
        }

        // Content-Type
        field_policy[RT_IDX_CONTINUATION][FN_IDX_CONTENT_TYPE] = POLICY_SHALL_NOT;

        // Warc-Ip-Address
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;

        // Warc-Concurrent-To
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;

        // Warc-Refers-To
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;

        // Warc-Target-Uri
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_TARGET_URI] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL_NOT;

        // Warc-Warcinfo-Id
        // Warc-Filename
        // Warc-Profile
        // Warc-Segment-Origin-Id
        // Warc-Segment-Total-Length
        // Defaults for every known record type; exceptions follow the loop.
        for (int i=1; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_WARCINFO_ID] = POLICY_MAY;
            field_policy[i][FN_IDX_WARC_FILENAME] = POLICY_SHALL_NOT;
            field_policy[i][FN_IDX_WARC_PROFILE] = POLICY_IGNORE;
            field_policy[i][FN_IDX_WARC_SEGMENT_ORIGIN_ID] = POLICY_SHALL_NOT;
            // Previously this line repeated SEGMENT_ORIGIN_ID, leaving the
            // total-length column unset (POLICY_IGNORE) for every type.
            field_policy[i][FN_IDX_WARC_SEGMENT_TOTAL_LENGTH] = POLICY_SHALL_NOT;
        }
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_WARCINFO_ID] = POLICY_MAY_NOT;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_FILENAME] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_PROFILE] = POLICY_MANDATORY;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_ORIGIN_ID] = POLICY_MANDATORY;
        // Total length is only allowed on continuation records, and is
        // optional there, so it is MAY rather than MANDATORY.
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_TOTAL_LENGTH] = POLICY_MAY;

        // Warc-Segment-Number
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_NUMBER] = POLICY_MANDATORY;

        // WARC-Refers-To-Target-URI
        // WARC-Refers-To-Date
        for (int i=1; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_REFERS_TO_TARGET_URI] = POLICY_SHALL_NOT;
            field_policy[i][FN_IDX_WARC_REFERS_TO_DATE] = POLICY_SHALL_NOT;
        }
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO_TARGET_URI] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO_DATE] = POLICY_MAY;
    }

}