001/**
002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
003 * and GZip files. (http://jwat.org/)
004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.jwat.warc;
019
020import org.jwat.common.Base16;
021import org.jwat.common.Base32;
022import org.jwat.common.Base64;
023import org.jwat.common.ByteCountingPushBackInputStream;
024import org.jwat.common.Diagnosis;
025import org.jwat.common.DiagnosisType;
026import org.jwat.common.Diagnostics;
027import org.jwat.common.HeaderLine;
028import org.jwat.common.HttpHeader;
029import org.jwat.common.NewlineParser;
030import org.jwat.common.Payload;
031import org.jwat.common.PayloadOnClosedHandler;
032
033import java.io.Closeable;
034import java.io.IOException;
035import java.io.InputStream;
036import java.util.Arrays;
037import java.util.Collections;
038import java.util.List;
039
040/**
041 * This class represents a parsed WARC record header block including
042 * possible validation and format warnings/errors encountered in the process.
043 * The payload of the WARC record is accessible through a wrapped payload
044 * object.
045 *
046 * @author nicl
047 */
048public class WarcRecord implements PayloadOnClosedHandler, Closeable {
049
050    /** Reader instance used, required for file compliance. */
051    protected WarcReader reader;
052
053    /** Input stream used to read this record. */
054    protected ByteCountingPushBackInputStream in;
055
056    /** Is this record compliant ie. error free. */
057    protected boolean bIsCompliant;
058
059    /** WARC record parsing start offset relative to the source WARC file input
060     *  stream. Used to keep track of the uncompressed amount of bytes consumed. */
061    protected long startOffset = -1;
062
063    /** Uncompressed bytes consumed while validating this record. */
064    protected long consumed;
065
066    /** Validation errors and warnings. */
067    public final Diagnostics<Diagnosis> diagnostics = new Diagnostics<Diagnosis>();
068
069    /** Newline parser for counting/validating trailing newlines. */
070    public NewlineParser nlp = new NewlineParser();
071
072    /** Is Warc-Block-Digest valid. (Null is equal to not tested) */
073    public Boolean isValidBlockDigest = null;
074
075    /** Is Warc-Payload-Digest valid. (Null is equal to not tested) */
076    public Boolean isValidPayloadDigest = null;
077
078    /** Number of trailing newlines after record. */
079    public int trailingNewlines;
080
081    /*
082     * Header-Fields.
083     */
084
085    /** WARC header. */
086    public WarcHeader header;
087
088    /*
089     * Payload
090     */
091
092    /** Has payload been closed before. */
093    protected boolean bPayloadClosed;
094
095    /** Has record been closed before. */
096    protected boolean bClosed;
097
098    /** Payload object if any exists. */
099    protected Payload payload;
100
101    /** HTTP header content parsed from payload. */
102    protected HttpHeader httpHeader;
103
104    /** Computed block digest. */
105    public WarcDigest computedBlockDigest;
106
107    /** Computed payload digest. */
108    public WarcDigest computedPayloadDigest;
109
110    /**
111     * Non public constructor to allow unit testing.
112     */
113    protected WarcRecord() {
114    }
115
116    /**
117     * Create a <code>WarcRecord</code> and prepare it for writing.
118     * @param writer writer which will be used to write the record
119     * @return a <code>WarcRecord</code> ready to be changed and then written
120     */
121    public static WarcRecord createRecord(WarcWriter writer) {
122        WarcRecord record = new WarcRecord();
123        record.header = WarcHeader.initHeader(writer, record.diagnostics);
124        writer.fieldParsers.diagnostics = record.diagnostics;
125        return record;
126    }
127
128    /**
129     * Given an <code>InputStream</code> it tries to read and validate a WARC
130     * header block.
131     * @param in <code>InputStream</code> containing WARC record data
132     * @param reader <code>WarcReader</code> used, with access to user defined
133     * options
134     * @return <code>WarcRecord</code> or <code>null</code>
135     * @throws IOException i/o exception in the process of reading record
136     */
137    public static WarcRecord parseRecord(ByteCountingPushBackInputStream in,
138                                    WarcReader reader) throws IOException {
139        WarcRecord record = new WarcRecord();
140        record.in = in;
141        record.reader = reader;
142        record.startOffset = in.getConsumed();
143        // Initialize WarcHeader with required context.
144        record.header = WarcHeader.initHeader(reader, in.getConsumed(), record.diagnostics);
145        WarcHeader header = record.header;
146        // Initialize WarcFieldParser to report diagnoses here.
147        reader.fieldParsers.diagnostics = record.diagnostics;
148        if (header.parseHeader(in)) {
149            ++reader.records;
150            /*
151             * Payload processing.
152             */
153            if (header.contentLength != null && header.contentLength > 0) {
154                /*
155                 * Payload.
156                 */
157                String digestAlgorithm = null;
158                if (reader.bBlockDigest) {
159                    if (header.warcBlockDigest != null && header.warcBlockDigest.algorithm != null) {
160                        // If a WARC block digest header is present in the
161                        // record, use that algorithm.
162                        digestAlgorithm = header.warcBlockDigest.algorithm;
163                    } else {
164                        // If no WARC block digest header is present,
165                        // use the optional user specified algorithm.
166                        // Can be null in which case nothing is computed.
167                        digestAlgorithm = reader.blockDigestAlgorithm;
168                    }
169                }
170                record.payload = Payload.processPayload(in, header.contentLength,
171                                         reader.payloadHeaderMaxSize, digestAlgorithm);
172                record.payload.setOnClosedHandler(record);
173                /*
174                 * HttpHeader.
175                 */
176                if (header.contentType != null
177                        && header.contentType.contentType.equals("application")
178                        && header.contentType.mediaType.equals("http")) {
179                    String value = header.contentType.getParameter("msgtype");
180                    // request
181                    int httpHeaderType = 0;
182                    if ("response".equalsIgnoreCase(value)) {
183                        httpHeaderType = HttpHeader.HT_RESPONSE;
184                    } else if ("request".equalsIgnoreCase(value)) {
185                        httpHeaderType = HttpHeader.HT_REQUEST;
186                    }
187                    if (httpHeaderType != 0) {
188                        digestAlgorithm = null;
189                        if (reader.bPayloadDigest) {
190                            if (header.warcPayloadDigest != null && header.warcPayloadDigest.algorithm != null) {
191                                // If a WARC payload digest header is present in the
192                                // record, use that algorithm.
193                                digestAlgorithm = header.warcPayloadDigest.algorithm;
194                            } else {
195                                // If no WARC payload digest header is present,
196                                // use the optional user specified algorithm.
197                                // Can be null in which case nothing is computed.
198                                digestAlgorithm = reader.payloadDigestAlgorithm;
199                            }
200                        }
201                        // Try to read a valid HTTP request/response header from the payload.
202                        record.httpHeader = HttpHeader.processPayload(httpHeaderType,
203                                record.payload.getInputStream(), header.contentLength,
204                                digestAlgorithm);
205                        if (record.httpHeader != null) {
206                            if (record.httpHeader.isValid()) {
207                                record.payload.setPayloadHeaderWrapped(record.httpHeader);
208                            } else {
209                                record.diagnostics.addError(
210                                        new Diagnosis(DiagnosisType.ERROR,
211                                                "http header",
212                                                "Unable to parse http header!"));
213                            }
214                        }
215                    }
216                }
217            }
218            // Preliminary compliance status, will be updated when the
219            // payload/record is closed.
220            if (record.diagnostics.hasErrors() || record.diagnostics.hasWarnings()) {
221                record.bIsCompliant = false;
222            } else {
223                record.bIsCompliant = true;
224            }
225            reader.bIsCompliant &= record.bIsCompliant;
226        } else {
227            // In case no record is found the errors/warnings in the record
228            // object are transfered to the Reader.
229            reader.diagnostics.addAll(record.diagnostics);
230            if (record.diagnostics.hasErrors() || record.diagnostics.hasWarnings()) {
231                reader.errors += record.diagnostics.getErrors().size();
232                reader.warnings += record.diagnostics.getWarnings().size();
233                reader.bIsCompliant = false;
234            }
235            // Require one or more records to be present.
236            if (reader.records == 0) {
237                reader.diagnostics.addError(new Diagnosis(DiagnosisType.ERROR_EXPECTED, "WARC file", "One or more records"));
238                ++reader.errors;
239                reader.bIsCompliant = false;
240            }
241            // EOF
242            record = null;
243        }
244        return record;
245    }
246
247    /**
248     * Called when the payload object is closed and final steps in the
249     * validation process can be performed.
250     * @throws IOException i/o exception in final validation processing
251     */
252    @Override
253    public void payloadClosed() throws IOException {
254        if (!bPayloadClosed) {
255            if (payload != null) {
256                // Check for truncated payload.
257                if (payload.getUnavailable() > 0) {
258                    // Payload length mismatch - Payload truncated
259                    addErrorDiagnosis(DiagnosisType.INVALID_DATA, "Payload length mismatch", "Payload truncated");
260                }
261                /*
262                 * Check block digest.
263                 */
264                byte[] digest = payload.getDigest();
265                // Check for computed block digest.
266                if (digest != null) {
267                    computedBlockDigest = new WarcDigest();
268                    computedBlockDigest.digestBytes = digest;
269                }
270                // Auto detect encoding used in WARC header.
271                if (header.warcBlockDigest != null && header.warcBlockDigest.digestString != null) {
272                    isValidBlockDigest = processWarcDigest(header.warcBlockDigest, computedBlockDigest, "block");
273                }
274                // Adjust information about computed block digest.
275                if (computedBlockDigest != null) {
276                    processComputedDigest(computedBlockDigest,
277                            reader.blockDigestAlgorithm, reader.blockDigestEncoding, "block");
278                }
279                if (httpHeader != null && httpHeader.isValid()) {
280                    /*
281                     * Check payload digest.
282                     */
283                    digest = httpHeader.getDigest();
284                    // Check for computed payload digest.
285                    if (digest != null) {
286                        computedPayloadDigest = new WarcDigest();
287                        computedPayloadDigest.digestBytes = digest;
288                    }
289                    // Auto detect encoding used in WARC header.
290                    if (header.warcPayloadDigest != null && header.warcPayloadDigest.digestString != null ) {
291                        isValidPayloadDigest = processWarcDigest(header.warcPayloadDigest, computedPayloadDigest, "payload");
292                    }
293                    // Adjust information about computed payload digest.
294                    if (computedPayloadDigest != null) {
295                        processComputedDigest(computedPayloadDigest,
296                                reader.payloadDigestAlgorithm, reader.payloadDigestEncoding, "payload");
297                    }
298                }
299            }
300            // Check for trailing newlines.
301            trailingNewlines = nlp.parseCRLFs(in, diagnostics);
302            if (trailingNewlines != WarcConstants.WARC_RECORD_TRAILING_NEWLINES) {
303                addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
304                        "Trailing newlines",
305                        Integer.toString(trailingNewlines),
306                        Integer.toString(WarcConstants.WARC_RECORD_TRAILING_NEWLINES));
307            }
308            // isCompliant status update.
309            if (diagnostics.hasErrors() || diagnostics.hasWarnings()) {
310                bIsCompliant = false;
311                reader.errors += diagnostics.getErrors().size();
312                reader.warnings += diagnostics.getWarnings().size();
313            } else {
314                bIsCompliant = true;
315            }
316            reader.bIsCompliant &= bIsCompliant;
317            // Updated consumed after payload has been consumed.
318            consumed = in.getConsumed() - startOffset;
319            // Don't not close payload again.
320            bPayloadClosed = true;
321            // Callback.
322            reader.recordClosed();
323        }
324    }
325
326    /**
327     * Auto-detect encoding used in WARC digest header and compare it to the
328     * internal one, if it has been computed.
329     * @param warcDigest digest from WARC header
330     * @param computedDigest internally compute digest
331     * @param digestName used to identify the digest ("block" or "payload")
332     * @return WARC digest validity indication
333     */
334    protected Boolean processWarcDigest(WarcDigest warcDigest, WarcDigest computedDigest, String digestName) {
335        byte[] digest;
336        Boolean isValidDigest = null;
337        int digestAlgorithmLength = WarcDigest.digestAlgorithmLength(warcDigest.algorithm);
338        digest = Base16.decodeToArray(warcDigest.digestString);
339        if (digest != null && digest.length == digestAlgorithmLength) {
340            warcDigest.digestBytes = digest;
341            warcDigest.encoding = "base16";
342        }
343        if (warcDigest.digestBytes == null) {
344            digest = Base32.decodeToArray(warcDigest.digestString, true);
345            if (digest != null && digest.length == digestAlgorithmLength) {
346                warcDigest.digestBytes = digest;
347                warcDigest.encoding = "base32";
348            }
349            if (warcDigest.digestBytes == null) {
350                digest = Base64.decodeToArray(warcDigest.digestString, true);
351                if (digest != null && digest.length == digestAlgorithmLength) {
352                    warcDigest.digestBytes = digest;
353                    warcDigest.encoding = "base64";
354                }
355            }
356        }
357        if (warcDigest.encoding == null) {
358            // Encoding - Unrecognized block digest encoding scheme
359            addErrorDiagnosis(DiagnosisType.UNKNOWN,
360                    "Record " + digestName + " digest encoding scheme",
361                    warcDigest.digestString);
362        }
363        if (computedDigest != null) {
364            computedDigest.algorithm = warcDigest.algorithm;
365            computedDigest.encoding = warcDigest.encoding;
366            if (warcDigest.digestBytes != null) {
367                if (!Arrays.equals(computedDigest.digestBytes, warcDigest.digestBytes)) {
368                    // Block digest - Computed block digest does not match
369                    addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
370                            "Incorrect " + digestName + " digest",
371                            Base16.encodeArray(warcDigest.digestBytes),
372                            Base16.encodeArray(computedDigest.digestBytes));
373                    isValidDigest = false;
374                } else {
375                    isValidDigest = true;
376                }
377            } else {
378                isValidDigest = false;
379            }
380        }
381        return isValidDigest;
382    }
383
384    /**
385     * Adjust algorithm and encoding information about computed block digest.
386     * @param computedDigest internally compute digest
387     * @param digestAlgorithm default algorithm
388     * @param digestEncoding default encoding
389     * @param digestName used to identify the digest ("block" or "payload")
390     */
391    protected void processComputedDigest(WarcDigest computedDigest, String digestAlgorithm, String digestEncoding, String digestName) {
392        if (computedDigest.algorithm == null) {
393            computedDigest.algorithm = digestAlgorithm;
394        }
395        if (computedDigest.encoding == null && digestEncoding != null) {
396            if ("base32".equals(digestEncoding)) {
397                computedDigest.encoding = "base32";
398            } else if ("base64".equals(digestEncoding)) {
399                computedDigest.encoding = "base64";
400            } else if ("base16".equals(digestEncoding)) {
401                computedDigest.encoding = "base16";
402            } else {
403                // Encoding - Unknown block digest encoding scheme ..
404                addErrorDiagnosis(DiagnosisType.UNKNOWN,
405                        "Default " + digestName + " digest encoding scheme",
406                        digestEncoding);
407            }
408        }
409        if (computedDigest.encoding != null) {
410            if ("base32".equals(computedDigest.encoding)) {
411                computedDigest.digestString = Base32.encodeArray(computedDigest.digestBytes);
412            } else if ("base64".equals(computedDigest.encoding)) {
413                computedDigest.digestString = Base64.encodeArray(computedDigest.digestBytes);
414            } else if ("base16".equals(computedDigest.encoding)) {
415                computedDigest.digestString = Base16.encodeArray(computedDigest.digestBytes);
416            }
417        }
418    }
419
420    /**
421     * Check to see if the record has been closed.
422     * @return boolean indicating whether this record is closed or not
423     */
424    public boolean isClosed() {
425        return bClosed;
426    }
427
428    /**
429     * Close resources associated with the WARC record.
430     * Mainly payload stream if any.
431     * @throws IOException if unable to close resources
432     */
433    public void close() throws IOException {
434        if (!bClosed) {
435            // Ensure input stream is at the end of the record payload.
436            if (payload != null) {
437                payload.close();
438            }
439            payloadClosed();
440            reader = null;
441            in = null;
442            bClosed = true;
443        }
444    }
445
446    /**
447     * Returns a boolean indicating the ISO compliance status of this record.
448     * @return a boolean indicating the ISO compliance status of this record
449     */
450    public boolean isCompliant() {
451        return bIsCompliant;
452    }
453
454    /**
455     * Get the record offset relative to the start of the WARC file
456     * <code>InputStream</code>.
457     * @return the record offset relative to the start of the WARC file
458     */
459    public long getStartOffset() {
460        return header.startOffset;
461    }
462
463    /**
464     * Return number of uncompressed bytes consumed validating this record.
465     * @return number of uncompressed bytes consumed validating this record
466     */
467    public long getConsumed() {
468        return consumed;
469    }
470
471    /**
472     * Get a <code>List</code> of all the non-standard WARC headers found
473     * during parsing.
474     * @return <code>List</code> of <code>HeaderLine</code>
475     */
476    public List<HeaderLine> getHeaderList() {
477        return Collections.unmodifiableList(header.headerList);
478    }
479
480    /**
481     * Get a non-standard WARC header or null, if nothing is stored for this
482     * header name.
483     * @param field header name
484     * @return <code>HeaderLine</code> structure or null
485     */
486    public HeaderLine getHeader(String field) {
487        if (field != null && field.length() > 0) {
488            return header.headerMap.get(field.toLowerCase());
489        } else {
490            return null;
491        }
492    }
493
494    /**
495     * Specifies whether this record has a payload or not.
496     * @return true/false whether the ARC record has a payload
497     */
498    public boolean hasPayload() {
499        return (payload != null);
500    }
501
502    /**
503     * Return Payload object.
504     * @return payload or <code>null</code>
505     */
506    public Payload getPayload() {
507        return payload;
508    }
509
510    /**
511     * Payload content <code>InputStream</code> getter.
512     * @return Payload content <code>InputStream</code>
513     */
514    public InputStream getPayloadContent() {
515        return (payload != null) ? payload.getInputStream() : null;
516    }
517
518    /**
519     * Returns the <code>HttpHeader</code> object if identified in the payload,
520     * or null.
521     * @return the <code>HttpHeader</code> object if identified or null
522     */
523    public HttpHeader getHttpHeader() {
524        return httpHeader;
525    }
526
527    /**
528     * Add an error diagnosis of the given type on a specific entity with
529     * optional extra information. The information varies according to the
530     * diagnosis type.
531     * @param type diagnosis type
532     * @param entity entity examined
533     * @param information optional extra information
534     */
535    protected void addErrorDiagnosis(DiagnosisType type, String entity, String... information) {
536        diagnostics.addError(new Diagnosis(type, entity, information));
537    }
538
539}