001/**
002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
003 * and GZip files. (http://jwat.org/)
004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.jwat.warc;
019
020import org.jwat.common.Diagnosis;
021import org.jwat.common.Diagnostics;
022import org.jwat.common.HeaderLineReader;
023import org.jwat.common.UriProfile;
024
025import java.io.Closeable;
026import java.io.IOException;
027import java.io.InputStream;
028import java.util.Iterator;
029import java.util.NoSuchElementException;
030
031/**
032 * Base class for WARC reader implementations.
033 *
034 * @author nicl
035 */
036public abstract class WarcReader implements Closeable {
037
038    /*
039     * Settings.
040     */
041
042    /** WARC-Target-URI profile. */
043    protected UriProfile warcTargetUriProfile;
044
045    /** URI profile. */
046    protected UriProfile uriProfile;
047
048    /** Default block digest algorithm to use if none is present in the
049     *  record. */
050    protected String blockDigestAlgorithm;
051
052    /** Default encoding scheme used to encode block digest into a string,
053     *  if none is detected from the record. */
054    protected String blockDigestEncoding = "base32";
055
056    /** Payload Digest enabled/disabled. */
057    protected boolean bPayloadDigest = false;
058
059    /** Default payload digest algorithm to use if none is present in the
060     *  record. */
061    protected String payloadDigestAlgorithm;
062
063    /** Default encoding scheme used to encode payload digest into a string,
064     *  if none is detected from the record. */
065    protected String payloadDigestEncoding = "base32";
066
067    /** Block Digest enabled/disabled. */
068    protected boolean bBlockDigest = false;
069
070    /** Max size allowed for a record header. */
071    protected int recordHeaderMaxSize;
072
073    /** Max size allowed for a payload header. */
074    protected int payloadHeaderMaxSize;
075
076    /** Line reader used to read version lines. */
077    protected HeaderLineReader lineReader;
078
079    /** Header line reader used to read the WARC headers. */
080    protected HeaderLineReader headerLineReader;
081
082    /** WARC field parser used. */
083    protected WarcFieldParsers fieldParsers;
084
085    /*
086     * State.
087     */
088
089    /** Reader level errors and warnings or when no record is available. */
090    public final Diagnostics<Diagnosis> diagnostics = new Diagnostics<Diagnosis>();
091
092    /** Compliance status for records parsed up to now. */
093    protected boolean bIsCompliant = true;
094
095    /** Number of bytes consumed by this reader. */
096    protected long consumed = 0;
097
098    /** Records parsed. */
099    protected int records = 0;
100
101    /** Aggregated number of errors encountered while parsing. */
102    protected int errors = 0;
103
104    /** Aggregate number of warnings encountered while parsing. */
105    protected int warnings = 0;
106
107    /** Current WARC record object. */
108    protected WarcRecord currentRecord;
109
110    /** Exception thrown while using the iterator. */
111    protected Exception iteratorExceptionThrown;
112
113    /**
114     * Method used to initialize a readers internal state.
115     * Must be called by all constructors.
116     */
117    protected void init() {
118        warcTargetUriProfile = UriProfile.RFC3986;
119        uriProfile = UriProfile.RFC3986;
120        recordHeaderMaxSize = 8192;
121        payloadHeaderMaxSize = 32768;
122        lineReader = HeaderLineReader.getReader();
123        lineReader.bNameValue = false;
124        lineReader.encoding = HeaderLineReader.ENC_US_ASCII;
125        headerLineReader = HeaderLineReader.getReader();
126        headerLineReader.bNameValue = true;
127        headerLineReader.encoding = HeaderLineReader.ENC_UTF8;
128        headerLineReader.bLWS = true;
129        headerLineReader.bQuotedText = true;
130        headerLineReader.bEncodedWords = true;
131        fieldParsers = new WarcFieldParsers();
132    }
133
134    /**
135     * Reset reader for reuse.
136     */
137    public void reset() {
138        diagnostics.reset();
139        bIsCompliant = true;
140        consumed = 0;
141        records = 0;
142        errors = 0;
143        warnings = 0;
144        currentRecord = null;
145    }
146
147    /**
148     * Returns a boolean indicating if all records parsed so far are compliant.
149     * @return a boolean indicating if all records parsed so far are compliant
150     */
151    public boolean isCompliant() {
152        return bIsCompliant;
153    }
154
155    /**
156     * Is this reader assuming GZip compressed input.
157     * @return boolean indicating the assumption of GZip compressed input
158     */
159    public abstract boolean isCompressed();
160
161    /**
162     * Set the URI profile used to validate WARC-Target URIs.
163     * If null, the uriProfile is set to RCF3986.
164     * @param uriProfile URI profile to use
165     */
166    public void setWarcTargetUriProfile(UriProfile uriProfile) {
167        if (uriProfile == null) {
168            uriProfile = UriProfile.RFC3986;
169        }
170        this.warcTargetUriProfile = uriProfile;
171    }
172
173    /**
174     * Get the URI profile used to validate WARC-Target URIs.
175     * @return the URI profile used to validate WARC-Target URIs
176     */
177    public UriProfile getWarcTargetUriProfile() {
178        return warcTargetUriProfile;
179    }
180
181    /**
182     * Set the URI profile used to validate URIs.
183     * If null, the uriProfile is set to RCF3986.
184     * @param uriProfile URI profile to use
185     */
186    public void setUriProfile(UriProfile uriProfile) {
187        if (uriProfile == null) {
188            uriProfile = UriProfile.RFC3986;
189        }
190        this.uriProfile = uriProfile;
191    }
192
193    /**
194     * Get the URI profile used to validate URIs.
195     * @return the URI profile used to validate URIs
196     */
197    public UriProfile getUriProfile() {
198        return uriProfile;
199    }
200
201    /**
202     * Get the readers block digest on/off status.
203     * @return boolean indicating block digest on/off
204     */
205    public boolean getBlockDigestEnabled() {
206        return bBlockDigest;
207    }
208
209    /**
210     * Set the readers block digest on/off status. Digest, however,
211     * will only be computed if either a Warc-Block-Digest header is
212     * present or an optional algorithm has been chosen.
213     * The Warc-Block-Digest always takes precedence.
214     * @param enabled boolean indicating block digest on/off
215     */
216    public void setBlockDigestEnabled(boolean enabled) {
217        bBlockDigest = enabled;
218    }
219
220    /**
221     * Get the readers payload digest on/off status.
222     * @return boolean indicating payload digest on/off
223     */
224    public boolean getPayloadDigestEnabled() {
225        return bPayloadDigest;
226    }
227
228    /**
229     * Set the readers payload digest on/off status. Digest, however,
230     * will only be computed if either a Warc-Payload-Digest header is
231     * present or an optional algorithm has been chosen.
232     * The Warc-Payload-Digest always takes precedence.
233     * @param enabled boolean indicating payload digest on/off
234     */
235    public void setPayloadDigestEnabled(boolean enabled) {
236        bPayloadDigest = enabled;
237    }
238
239    /**
240     * Get the default block digest algorithm.
241     * @return default block digest algorithm
242     */
243    public String getBlockDigestAlgorithm() {
244        return blockDigestAlgorithm;
245    }
246
247    /**
248     * Tries to set the default block digest algorithm and returns a boolean
249     * indicating whether the algorithm was accepted or not. This algorithm is
250     * only used in case no WARC payload digest header is present in the record.
251     * @param digestAlgorithm block digest algorithm
252     * (null means no default block digest algorithm is selected)
253     * @return boolean indicating the validity of the algorithm supplied
254     */
255    public boolean setBlockDigestAlgorithm(String digestAlgorithm) {
256        if (digestAlgorithm == null || digestAlgorithm.length() == 0) {
257            blockDigestAlgorithm = null;
258            return true;
259        }
260        if (WarcDigest.digestAlgorithmLength(digestAlgorithm) > 0) {
261            blockDigestAlgorithm = digestAlgorithm;
262            return true;
263        }
264        return false;
265    }
266
267    /**
268     * Get the default payload digest algorithm.
269     * @return default payload digest algorithm
270     */
271    public String getPayloadDigestAlgorithm() {
272        return payloadDigestAlgorithm;
273    }
274
275    /**
276     * Tries to set the default payload digest algorithm and returns a boolean
277     * indicating whether the algorithm was accepted or not. This algorithm is
278     * only used in case no WARC payload digest header is present in the record.
279     * @param digestAlgorithm payload digest algorithm
280     * (null means no default payload digest algorithm is selected)
281     * @return boolean indicating the validity of the algorithm supplied
282     */
283    public boolean setPayloadDigestAlgorithm(String digestAlgorithm) {
284        if (digestAlgorithm == null || digestAlgorithm.length() == 0) {
285            payloadDigestAlgorithm = null;
286            return true;
287        }
288        if (WarcDigest.digestAlgorithmLength(digestAlgorithm) > 0) {
289            payloadDigestAlgorithm = digestAlgorithm;
290            return true;
291        }
292        return false;
293    }
294
295    /**
296     * Get the default block digest encoding scheme.
297     * @return default block digest encoding scheme
298     */
299    public String getBlockDigestEncoding() {
300        return blockDigestEncoding;
301    }
302
303    /**
304     * Set the default block digest encoding scheme. This scheme is only
305     * used if none can be inferred from an existing block digest header.
306     * @param encodingScheme encoding scheme
307     * (null means default block digest is not encoded)
308     */
309    public void setBlockDigestEncoding(String encodingScheme) {
310        if (encodingScheme != null && encodingScheme.length() > 0) {
311            blockDigestEncoding = encodingScheme.toLowerCase();
312        } else {
313            blockDigestEncoding = null;
314        }
315    }
316
317    /**
318     * Get the default payload digest encoding scheme.
319     * @return default payload digest encoding scheme
320     */
321    public String getPayloadDigestEncoding() {
322        return payloadDigestEncoding;
323    }
324
325    /**
326     * Set the default payload digest encoding scheme. This scheme is only
327     * used if none can be inferred from an existing payload digest header.
328     * @param encodingScheme encoding scheme
329     * (null means default payload digest is not encoded)
330     */
331    public void setPayloadDigestEncoding(String encodingScheme) {
332        if (encodingScheme != null && encodingScheme.length() > 0) {
333            payloadDigestEncoding = encodingScheme.toLowerCase();
334        } else {
335            payloadDigestEncoding = null;
336        }
337    }
338
339    /**
340     * Get the max size allowed for a record header.
341     * @return max size allowed for a record header
342     */
343    public int getRecordHeaderMaxSize() {
344        return recordHeaderMaxSize;
345    }
346
347    /**
348     * Set the max size allowed for a record header.
349     * @param size max size allowed
350     */
351    public void setRecordHeaderMaxSize(int size) {
352        recordHeaderMaxSize = size;
353    }
354
355    /**
356     * Get the max size allowed for a payload header.
357     * @return max size allowed for a payload header
358     */
359    public int getPayloadHeaderMaxSize() {
360        return payloadHeaderMaxSize;
361    }
362
363    /**
364     * Set the max size allowed for a payload header.
365     * @param size max size allowed
366     */
367    public void setPayloadHeaderMaxSize(int size) {
368        payloadHeaderMaxSize = size;
369    }
370
371    /**
372     * Close current record resource(s) and input stream(s).
373     */
374    public abstract void close();
375
376    /**
377     * Callback method called when the payload has been processed.
378     */
379    protected abstract void recordClosed();
380
381    /**
382     * Get the offset of the current WARC record or -1 if none have been read.
383     * @return offset of the current WARC record or -1
384     */
385    public abstract long getStartOffset();
386
387    /**
388     * Get the current offset in the WARC <code>InputStream</code>.
389     * @return offset in WARC <code>InputStream</code>
390     */
391    public abstract long getOffset();
392
393    /**
394     * Get number of bytes consumed by this reader.
395     * @return number of bytes consumed by this reader
396     */
397   public abstract long getConsumed();
398
399   /**
400     * Parses and gets the next record.
401     * This method is for linear access to records.
402     * @return the next record
403     * @throws IOException i/o exception in parsing process
404     */
405    public abstract WarcRecord getNextRecord() throws IOException;
406
407    /**
408     * Parses and gets the next record from an <code>Inputstream</code>.
409     * This method is mainly for random access use since there are serious
410     * side-effects involved in using multiple <code>PushBackInputStream</code>
411     * instances.
412     * @param in <code>InputStream</code> used to read next record
413     * @param offset offset provided by caller
414     * @return the next record
415     * @throws IOException i/o exception in parsing process
416     */
417    public abstract WarcRecord getNextRecordFrom(InputStream in, long offset)
418                                                        throws IOException;
419
420    /**
421     * Parses and gets the next record from an <code>Inputstream</code> wrapped
422     * by a <code>BufferedInputStream</code>.
423     * This method is mainly for random access use since there are serious
424     * side-effects involved in using multiple <code>PushBackInputStream</code>
425     * instances.
426     * @param in <code>InputStream</code> used to read next record
427     * @param offset offset provided by caller
428     * @param buffer_size buffer size to use
429     * @return the next record
430     * @throws IOException i/o exception in parsing process
431     */
432    public abstract WarcRecord getNextRecordFrom(InputStream in, long offset,
433                                        int buffer_size) throws IOException;
434
435    /**
436     * Gets an exception thrown in the iterator if any or null.
437     * @return exception thrown in the iterator if any or null
438     */
439    public Exception getIteratorExceptionThrown() {
440        return iteratorExceptionThrown;
441    }
442
443    /**
444     * Returns an <code>Iterator</code> over the records as they are being
445     * parsed. Any exception thrown during parsing is accessible through the
446     * <code>getIteratorExceptionThrown</code> method.
447     * @return <code>Iterator</code> over the records
448     */
449    public Iterator<WarcRecord> iterator() {
450        return new Iterator<WarcRecord>() {
451
452            /** Internal next record updated by either hasNext() or next(). */
453            private WarcRecord next;
454
455            /** Entry returned by next(). */
456            private WarcRecord current;
457
458            @Override
459            public boolean hasNext() {
460                if (next == null) {
461                    iteratorExceptionThrown = null;
462                    try {
463                        next = getNextRecord();
464                    } catch (IOException e) {
465                        iteratorExceptionThrown = e;
466                    }
467                }
468                return (next != null);
469            }
470
471            @Override
472            public WarcRecord next() {
473                if (next == null) {
474                    iteratorExceptionThrown = null;
475                    try {
476                        next = getNextRecord();
477                    } catch (IOException e) {
478                        iteratorExceptionThrown = e;
479                    }
480                }
481                if (next == null) {
482                    throw new NoSuchElementException();
483                }
484                current = next;
485                next = null;
486                return current;
487            }
488
489            @Override
490            public void remove() {
491                throw new UnsupportedOperationException();
492            }
493        };
494    }
495
496}