001/**
002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
003 * and GZip files. (http://jwat.org/)
004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.jwat.warc;
019
020import java.io.BufferedInputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024
025import org.jwat.archive.common.ReaderFactoryAbstract;
026import org.jwat.common.ByteCountingPushBackInputStream;
027import org.jwat.gzip.GzipReader;
028
029/**
030 * Factory used for creating <code>WarcReader</code> instances.
031 * The general <code>getReader</code> methods will auto-detect Gzip'ed data
032 * and return the appropriate <code>WarcReader</code> instances.
033 * The other factory methods can be used to return specific
034 * <code>WarcReader</code> instances for compressed or uncompressed records.
035 * Readers are available for both sequential and random reading of records.
036 * Use of buffered methods and/or buffering speeds up the reader considerably.
037 *
038 * @author nicl
039 */
040public class WarcReaderFactory extends ReaderFactoryAbstract {
041
042    /** Buffer size used by <code>PushbackInputStream</code>. */
043    public static final int PUSHBACK_BUFFER_SIZE = 32;
044
045    /**
046     * Private constructor to enforce factory methods.
047     */
048    protected WarcReaderFactory() {
049    }
050
051    /**
052     * Check head of <code>PushBackInputStream</code> for a WARC file identifier.
053     * The identifier for WARC files is "WARC/" in the beginning.
054     * @param pbin <code>PushBackInputStream</code> with WARC records
055     * @return boolean indicating presence of a WARC file identifier
056     * @throws IOException if an i/o error occurs while examining head of stream
057     */
058    public static boolean isWarcFile(ByteCountingPushBackInputStream pbin) throws IOException {
059        return isWarcRecord(pbin);
060    }
061
062    /**
063     * Check head of <code>PushBackInputStream</code> for a WARC record identifier.
064     * The identifier for WARC records is "WARC/" in the beginning.
065     * @param pbin <code>PushBackInputStream</code> with WARC records
066     * @return boolean indicating presence of a WARC magic number
067     * @throws IOException if an i/o error occurs while examining head of stream
068     */
069    public static boolean isWarcRecord(ByteCountingPushBackInputStream pbin) throws IOException {
070        byte[] streamBytes = new byte[WarcConstants.WARC_MAGIC_HEADER.length()];
071        byte[] warcBytes = WarcConstants.WARC_MAGIC_HEADER.getBytes();
072        // Look for the leading magic bytes in front of every valid WARC record.
073        pbin.peek(streamBytes);
074        return (Arrays.equals(warcBytes, streamBytes));
075    }
076
077    /**
078     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
079     * wrapped by a <code>BufferedInputStream</code>.
080     * The <code>WarcReader</code> implementation returned is chosen based on
081     * GZip auto detection.
082     * @param in WARC File represented as <code>InputStream</code>
083     * @param buffer_size buffer size to use
084     * @return appropriate <code>WarcReader</code> based on data read from
085     * <code>InputStream</code>
086     * @throws IOException if an i/o exception occurs during initialization
087     */
088    public static WarcReader getReader(InputStream in, int buffer_size)
089                                                        throws IOException {
090        if (in == null) {
091            throw new IllegalArgumentException(
092                    "The inputstream 'in' is null");
093        }
094        if (buffer_size <= 0) {
095            throw new IllegalArgumentException(
096                    "The 'buffer_size' is less than or equal to zero: " +
097                    buffer_size);
098        }
099        ByteCountingPushBackInputStream pbin =
100                new ByteCountingPushBackInputStream(
101                        new BufferedInputStream(in, buffer_size),
102                PUSHBACK_BUFFER_SIZE);
103        if (GzipReader.isGzipped(pbin)) {
104            return new WarcReaderCompressed(new GzipReader(pbin),
105                                            buffer_size);
106        }
107        return new WarcReaderUncompressed(pbin);
108    }
109
110    /**
111     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>.
112     * The <code>WarcReader</code> implementation returned is chosen based on
113     * GZip auto detection.
114     * @param in WARC File represented as <code>InputStream</code>
115     * @return appropriate <code>WarcReader</code> based on data read from
116     * <code>InputStream</code>
117     * @throws IOException if an i/o exception occurs during initialization
118     */
119    public static WarcReader getReader(InputStream in) throws IOException {
120        if (in == null) {
121            throw new IllegalArgumentException(
122                    "The inputstream 'in' is null");
123        }
124        ByteCountingPushBackInputStream pbin =
125                new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
126        if (GzipReader.isGzipped(pbin)) {
127            return new WarcReaderCompressed(new GzipReader(pbin));
128        }
129        return new WarcReaderUncompressed(pbin);
130    }
131
132    /**
133     * Creates a new <code>WarcReader</code> without any associated
134     * <code>InputStream</code> for random access to uncompressed records.
135     * @return <code>WarcReader</code> for uncompressed records read from
136     * <code>InputStream</code>
137     */
138    public static WarcReaderUncompressed getReaderUncompressed() {
139        return new WarcReaderUncompressed();
140    }
141
142    /**
143     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
144     * primarily for random access to uncompressed records.
145     * @param in WARC File represented as <code>InputStream</code>
146     * @return <code>WarcReader</code> for uncompressed records read from
147     * <code>InputStream</code>
148     * @throws IOException i/o exception while initializing reader
149     */
150    public static WarcReaderUncompressed getReaderUncompressed(InputStream in)
151                                                        throws IOException {
152        if (in == null) {
153            throw new IllegalArgumentException(
154                    "The inputstream 'in' is null");
155        }
156        ByteCountingPushBackInputStream pbin =
157                new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
158        return new WarcReaderUncompressed(pbin);
159    }
160
161    /**
162     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
163     * wrapped by a <code>BufferedInputStream</code> primarily for random
164     * access to uncompressed records.
165     * @param in WARC File represented as <code>InputStream</code>
166     * @param buffer_size buffer size to use
167     * @return <code>WarcReader</code> for uncompressed records read from
168     * <code>InputStream</code>
169     * @throws IOException i/o exception while initializing reader
170     */
171    public static WarcReaderUncompressed getReaderUncompressed(InputStream in,
172                                        int buffer_size) throws IOException {
173        if (in == null) {
174            throw new IllegalArgumentException(
175                    "The inputstream 'in' is null");
176        }
177        if (buffer_size <= 0) {
178            throw new IllegalArgumentException(
179                    "The 'buffer_size' is less than or equal to zero: " +
180                    buffer_size);
181        }
182        ByteCountingPushBackInputStream pbin =
183                new ByteCountingPushBackInputStream(
184                        new BufferedInputStream(in, buffer_size),
185                PUSHBACK_BUFFER_SIZE);
186        return new WarcReaderUncompressed(pbin);
187    }
188
189    /**
190     * Creates a new <code>WarcReader</code> without any associated
191     * <code>InputStream</code> for random access to GZip compressed records.
192     * @return <code>WarcReader</code> for GZip compressed records read from
193     * <code>InputStream</code>
194     */
195    public static WarcReaderCompressed getReaderCompressed() {
196        return new WarcReaderCompressed();
197    }
198
199    /**
200     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
201     * primarily for random access to GZip compressed records.
202     * @param in WARC File represented as <code>InputStream</code>
203     * @return <code>WarcReader</code> for GZip compressed records read from
204     * <code>InputStream</code>
205     * @throws IOException i/o exception while initializing reader
206     */
207    public static WarcReaderCompressed getReaderCompressed(InputStream in)
208                                                        throws IOException {
209        if (in == null) {
210            throw new IllegalArgumentException(
211                    "The inputstream 'in' is null");
212        }
213        return new WarcReaderCompressed(new GzipReader(in));
214    }
215
216    /**
217     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
218     * wrapped by a <code>BufferedInputStream</code> primarily for random
219     * access to GZip compressed records.
220     * @param in WARC File represented as <code>InputStream</code>
221     * @param buffer_size buffer size to use
222     * @return <code>WarcReader</code> for GZip compressed records read from
223     * <code>InputStream</code>
224     * @throws IOException i/o exception while initializing reader
225     */
226    public static WarcReaderCompressed getReaderCompressed(InputStream in,
227                                        int buffer_size) throws IOException {
228        if (in == null) {
229            throw new IllegalArgumentException(
230                    "The inputstream 'in' is null");
231        }
232        if (buffer_size <= 0) {
233            throw new IllegalArgumentException(
234                    "The 'buffer_size' is less than or equal to zero: " +
235                    buffer_size);
236        }
237        return new WarcReaderCompressed(new GzipReader(
238                new BufferedInputStream(in, buffer_size)));
239    }
240
241}