001/**
002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
003 * and GZip files. (http://jwat.org/)
004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.jwat.warc;
019
020import java.io.BufferedInputStream;
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.jwat.common.ByteCountingPushBackInputStream;
025import org.jwat.gzip.GzipEntry;
026import org.jwat.gzip.GzipReader;
027
028/**
029 * WARC Reader implementation for reading GZip compressed files.
030 * Use WarcReaderFactory to get an instance of this class.
031 *
032 * @author nicl
033 */
034public class WarcReaderCompressed extends WarcReader {
035
036    /** Buffer size used by <code>PushbackInputStream</code>. */
037    public static final int PUSHBACK_BUFFER_SIZE = 32;
038
039    /** WARC file <code>InputStream</code>. */
040    protected GzipReader reader;
041
042    /** Buffer size, if any, to use on GZip entry <code>InputStream</code>. */
043    protected int bufferSize;
044
045    /** GZip reader used for the current record, if random access methods used. */
046    protected GzipReader currentReader;
047
048    /** GZip entry for the current record, if random access methods used. */
049    protected GzipEntry currentEntry;
050
051    /**
052     * This constructor is used to get random access to records.
053     * The records are then accessed using the getNextRecordFrom methods
054     * using a supplied input stream for each record.
055     */
056    public WarcReaderCompressed() {
057        init();
058    }
059
060    /**
061     * Construct reader using the supplied input stream.
062     * This method is primarily for sequential access to records.
063     * @param reader GZip reader
064     */
065    public WarcReaderCompressed(GzipReader reader) {
066        if (reader == null) {
067            throw new IllegalArgumentException(
068                    "'reader' is null");
069        }
070        this.reader = reader;
071        init();
072    }
073
074    /**
075     * Construct object using supplied <code>GzipInputStream</code>.
076     * This method is primarily for sequential access to records.
077     * @param reader GZip reader
078     * @param buffer_size buffer size used on entries
079     */
080    public WarcReaderCompressed(GzipReader reader, int buffer_size) {
081        if (reader == null) {
082            throw new IllegalArgumentException(
083                    "'reader' is null");
084        }
085        if (buffer_size <= 0) {
086            throw new IllegalArgumentException(
087                    "The 'buffer_size' is less than or equal to zero: "
088                    + buffer_size);
089        }
090        this.reader = reader;
091        this.bufferSize = buffer_size;
092        init();
093    }
094
095    @Override
096    public boolean isCompressed() {
097        return true;
098    }
099
100    @Override
101    public void close() {
102        if (currentRecord != null) {
103            try {
104                currentRecord.close();
105            } catch (IOException e) { /* ignore */ }
106            currentRecord = null;
107        }
108        if (reader != null) {
109            startOffset = reader.getStartOffset();
110            consumed = reader.getOffset();
111            try {
112                reader.close();
113            } catch (IOException e) { /* ignore */ }
114            reader = null;
115        }
116    }
117
118    @Override
119    protected void recordClosed() {
120        if (currentEntry != null) {
121            try {
122                currentEntry.close();
123                consumed += currentEntry.consumed;
124            } catch (IOException e) { /* ignore */ }
125            currentEntry = null;
126        } else {
127            throw new IllegalStateException("'currentEntry' is null, this should never happen!");
128        }
129    }
130
131    /** Cached start offset used after the reader is closed. */
132    protected long startOffset = -1;
133
134    /**
135     * Get the offset of the current WARC record from the GZip entry or -1 if
136     * no records have been read yet.
137     * @return offset of the current WARC record from the GZip entry or -1
138     */
139    @Override
140    public long getStartOffset() {
141        if (reader != null) {
142            return reader.getStartOffset();
143        } else {
144            return startOffset;
145        }
146    }
147
148    /**
149     * Get the current offset in the WARC <code>GzipReader</code>.
150     * @return offset in WARC <code>InputStream</code>
151     */
152    @Override
153    public long getOffset() {
154        if (reader != null) {
155            return reader.getOffset();
156        } else {
157            return consumed;
158        }
159    }
160
161    /** Get number of bytes consumed by the WARC <code>GzipReader</code>.
162     * @return number of bytes consumed by the WARC <code>GzipReader</code>
163     */
164    @Override
165    public long getConsumed() {
166        if (reader != null) {
167            return reader.getOffset();
168        } else {
169            return consumed;
170        }
171    }
172
173    @Override
174    public WarcRecord getNextRecord() throws IOException {
175        if (currentRecord != null) {
176            currentRecord.close();
177        }
178        if (reader == null) {
179            throw new IllegalStateException(
180                    "This reader has been initialized with an incompatible constructor, 'reader' is null");
181        }
182        currentRecord = null;
183        currentReader = reader;
184        currentEntry = reader.getNextEntry();
185        if (currentEntry != null) {
186            ByteCountingPushBackInputStream pbin;
187            if (bufferSize > 0) {
188                pbin = new ByteCountingPushBackInputStream(
189                        new BufferedInputStream(
190                                currentEntry.getInputStream(), bufferSize),
191                                PUSHBACK_BUFFER_SIZE);
192            }
193            else {
194                pbin = new ByteCountingPushBackInputStream(
195                        currentEntry.getInputStream(), PUSHBACK_BUFFER_SIZE);
196            }
197            currentRecord = WarcRecord.parseRecord(pbin, this);
198        }
199        if (currentRecord != null) {
200            startOffset = currentEntry.getStartOffset();
201            currentRecord.header.startOffset = currentEntry.getStartOffset();
202        }
203        return currentRecord;
204    }
205
206    @Override
207    public WarcRecord getNextRecordFrom(InputStream rin, long offset)
208                                                        throws IOException {
209        if (currentRecord != null) {
210            currentRecord.close();
211        }
212        if (reader != null) {
213            throw new IllegalStateException(
214                    "This reader has been initialized with an incompatible constructor, 'reader' is not null");
215        }
216        if (rin == null) {
217            throw new IllegalArgumentException(
218                    "The inputstream 'rin' is null");
219        }
220        if (offset < -1) {
221            throw new IllegalArgumentException(
222                    "The 'offset' is less than -1: " + offset);
223        }
224        currentRecord = null;
225        currentReader = new GzipReader(rin);
226        currentEntry = currentReader.getNextEntry();
227        if (currentEntry != null) {
228            ByteCountingPushBackInputStream pbin =
229                    new ByteCountingPushBackInputStream(
230                            currentEntry.getInputStream(), PUSHBACK_BUFFER_SIZE);
231            currentRecord = WarcRecord.parseRecord(pbin, this);
232        }
233        if (currentRecord != null) {
234            startOffset = offset;
235            currentRecord.header.startOffset = offset;
236        }
237        return currentRecord;
238    }
239
240    @Override
241    public WarcRecord getNextRecordFrom(InputStream rin, long offset,
242                                        int buffer_size) throws IOException {
243        if (currentRecord != null) {
244            currentRecord.close();
245        }
246        if (reader != null) {
247            throw new IllegalStateException(
248                    "This reader has been initialized with an incompatible constructor, 'reader' is not null");
249        }
250        if (rin == null) {
251            throw new IllegalArgumentException(
252                    "The inputstream 'rin' is null");
253        }
254        if (offset < -1) {
255            throw new IllegalArgumentException(
256                    "The 'offset' is less than -1: " + offset);
257        }
258        if (buffer_size <= 0) {
259            throw new IllegalArgumentException(
260                    "The 'buffer_size' is less than or equal to zero: "
261                    + buffer_size);
262        }
263        currentRecord = null;
264        currentReader = new GzipReader(rin);
265        currentEntry = currentReader.getNextEntry();
266        if (currentEntry != null) {
267            ByteCountingPushBackInputStream pbin =
268                    new ByteCountingPushBackInputStream(
269                            new BufferedInputStream(
270                                    currentEntry.getInputStream(), buffer_size),
271                                    PUSHBACK_BUFFER_SIZE);
272            currentRecord = WarcRecord.parseRecord(pbin, this);
273        }
274        if (currentRecord != null) {
275            startOffset = offset;
276            currentRecord.header.startOffset = offset;
277        }
278        return currentRecord;
279    }
280
281}