/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.gzip.GzipEntry;
import org.jwat.gzip.GzipReader;

/**
 * WARC Reader implementation for reading GZip compressed files.
 * Use WarcReaderFactory to get an instance of this class.
 *
 * <p>An instance works in one of two mutually exclusive modes, decided by
 * the constructor used:</p>
 * <ul>
 * <li>sequential access, when constructed with a <code>GzipReader</code>;
 *     records are read with {@link #getNextRecord()}.</li>
 * <li>random access, when constructed without arguments; records are read
 *     with the <code>getNextRecordFrom</code> methods, each call being
 *     handed its own input stream.</li>
 * </ul>
 * Calling a method belonging to the other mode results in an
 * <code>IllegalStateException</code>.
 *
 * @author nicl
 */
public class WarcReaderCompressed extends WarcReader {

    /** Buffer size used by <code>PushbackInputStream</code>. */
    public static final int PUSHBACK_BUFFER_SIZE = 32;

    /** GZip reader the WARC records are read from, null in random access mode. */
    protected GzipReader reader;

    /** Buffer size, if any, to use on GZip entry <code>InputStream</code>. */
    protected int bufferSize;

    /** GZip reader used for the current record, if random access methods used. */
    protected GzipReader currentReader;

    /** GZip entry for the current record, if random access methods used. */
    protected GzipEntry currentEntry;

    /** Cached start offset used after the reader is closed. */
    protected long startOffset = -1;

    /**
     * This constructor is used to get random access to records.
     * The records are then accessed using the getNextRecordFrom methods
     * using a supplied input stream for each record.
     */
    public WarcReaderCompressed() {
        init();
    }

    /**
     * Construct reader using the supplied <code>GzipReader</code>.
     * This method is primarily for sequential access to records.
     * No additional buffering is applied to the GZip entry streams.
     * @param reader GZip reader
     * @throws IllegalArgumentException if <code>reader</code> is null
     */
    public WarcReaderCompressed(GzipReader reader) {
        if (reader == null) {
            throw new IllegalArgumentException(
                    "'reader' is null");
        }
        this.reader = reader;
        init();
    }

    /**
     * Construct reader using the supplied <code>GzipReader</code> and
     * buffer the input stream of each GZip entry.
     * This method is primarily for sequential access to records.
     * @param reader GZip reader
     * @param buffer_size buffer size used on entries
     * @throws IllegalArgumentException if <code>reader</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public WarcReaderCompressed(GzipReader reader, int buffer_size) {
        if (reader == null) {
            throw new IllegalArgumentException(
                    "'reader' is null");
        }
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: "
                    + buffer_size);
        }
        this.reader = reader;
        this.bufferSize = buffer_size;
        init();
    }

    /**
     * This reader always handles GZip compressed data.
     * @return always <code>true</code>
     */
    @Override
    public boolean isCompressed() {
        return true;
    }

    /**
     * Close the current record, if any, and the underlying
     * <code>GzipReader</code>, if any. Before the reader is released its
     * start offset and offset are cached so that {@link #getStartOffset()},
     * {@link #getOffset()} and {@link #getConsumed()} keep returning values
     * afterwards. I/O errors raised while closing are ignored.
     */
    @Override
    public void close() {
        if (currentRecord != null) {
            try {
                currentRecord.close();
            } catch (IOException ignored) {
                // Best effort: a failing record must not prevent the reader
                // itself from being closed below.
            }
            currentRecord = null;
        }
        if (reader != null) {
            // Snapshot the offsets while the reader is still available.
            startOffset = reader.getStartOffset();
            consumed = reader.getOffset();
            try {
                reader.close();
            } catch (IOException ignored) {
                // Best effort: close() does not declare IOException.
            }
            reader = null;
        }
    }

    /**
     * Closes the GZip entry backing the current record and adds the number
     * of bytes the entry consumed to this reader's consumed count.
     * NOTE(review): presumably invoked by the record when it is closed; the
     * caller is not visible in this file.
     * @throws IllegalStateException if there is no current GZip entry
     */
    @Override
    protected void recordClosed() {
        if (currentEntry == null) {
            throw new IllegalStateException("'currentEntry' is null, this should never happen!");
        }
        try {
            currentEntry.close();
            // Only reached when the entry closed without an error.
            consumed += currentEntry.consumed;
        } catch (IOException ignored) {
            // Best effort: this callback does not declare IOException.
        }
        currentEntry = null;
    }

    /**
     * Get the offset of the current WARC record from the GZip entry or -1 if
     * no records have been read yet.
     * @return offset of the current WARC record from the GZip entry or -1
     */
    @Override
    public long getStartOffset() {
        if (reader != null) {
            return reader.getStartOffset();
        }
        // No sequential reader (random access mode or closed): cached value.
        return startOffset;
    }

    /**
     * Get the current offset in the WARC <code>GzipReader</code>.
     * When no <code>GzipReader</code> is attached the consumed byte count is
     * returned instead.
     * @return offset in the WARC <code>GzipReader</code>
     */
    @Override
    public long getOffset() {
        if (reader != null) {
            return reader.getOffset();
        }
        return consumed;
    }

    /**
     * Get number of bytes consumed by the WARC <code>GzipReader</code>.
     * @return number of bytes consumed by the WARC <code>GzipReader</code>
     */
    @Override
    public long getConsumed() {
        if (reader != null) {
            return reader.getOffset();
        }
        return consumed;
    }

    /**
     * Read the next record sequentially from the <code>GzipReader</code>
     * supplied at construction time. A still open current record is closed
     * first.
     * @return the next record or null if there are no more GZip entries or
     * no record could be parsed from the entry
     * @throws IOException if an I/O error occurs while closing the previous
     * record or reading the next one
     * @throws IllegalStateException if this reader was constructed for
     * random access (no <code>GzipReader</code>)
     */
    @Override
    public WarcRecord getNextRecord() throws IOException {
        if (currentRecord != null) {
            currentRecord.close();
        }
        if (reader == null) {
            throw new IllegalStateException(
                    "This reader has been initialized with an incompatible constructor, 'reader' is null");
        }
        currentRecord = null;
        currentReader = reader;
        currentEntry = reader.getNextEntry();
        if (currentEntry != null) {
            currentRecord = WarcRecord.parseRecord(
                    wrapEntry(currentEntry, bufferSize), this);
        }
        if (currentRecord != null) {
            // In sequential mode the record offset is the GZip entry offset.
            startOffset = currentEntry.getStartOffset();
            currentRecord.header.startOffset = currentEntry.getStartOffset();
        }
        return currentRecord;
    }

    /**
     * Read a record from the supplied input stream without extra buffering
     * of the GZip entry stream. A still open current record is closed first.
     * A new <code>GzipReader</code> is created on the stream for every call.
     * @param rin input stream positioned at a GZip entry
     * @param offset offset to report as the start offset of the record,
     * must not be less than -1
     * @return the record or null if the stream holds no GZip entry or no
     * record could be parsed from the entry
     * @throws IOException if an I/O error occurs while closing the previous
     * record or reading the new one
     * @throws IllegalStateException if this reader was constructed with a
     * <code>GzipReader</code>
     * @throws IllegalArgumentException if <code>rin</code> is null or
     * <code>offset</code> is less than -1
     */
    @Override
    public WarcRecord getNextRecordFrom(InputStream rin, long offset)
                                                        throws IOException {
        prepareRandomAccess(rin, offset);
        // A buffer size of zero selects the unbuffered entry stream.
        return readRecordFrom(rin, offset, 0);
    }

    /**
     * Read a record from the supplied input stream, buffering the GZip entry
     * stream. A still open current record is closed first.
     * A new <code>GzipReader</code> is created on the stream for every call.
     * @param rin input stream positioned at a GZip entry
     * @param offset offset to report as the start offset of the record,
     * must not be less than -1
     * @param buffer_size buffer size used on the GZip entry stream
     * @return the record or null if the stream holds no GZip entry or no
     * record could be parsed from the entry
     * @throws IOException if an I/O error occurs while closing the previous
     * record or reading the new one
     * @throws IllegalStateException if this reader was constructed with a
     * <code>GzipReader</code>
     * @throws IllegalArgumentException if <code>rin</code> is null,
     * <code>offset</code> is less than -1 or <code>buffer_size</code> is
     * less than or equal to zero
     */
    @Override
    public WarcRecord getNextRecordFrom(InputStream rin, long offset,
                                    int buffer_size) throws IOException {
        prepareRandomAccess(rin, offset);
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: "
                    + buffer_size);
        }
        return readRecordFrom(rin, offset, buffer_size);
    }

    /**
     * Closes the current record, if any, and then verifies the
     * preconditions shared by the random access methods.
     * @param rin input stream supplied by the caller
     * @param offset record offset supplied by the caller
     * @throws IOException if closing the current record fails
     */
    private void prepareRandomAccess(InputStream rin, long offset)
                                                        throws IOException {
        if (currentRecord != null) {
            currentRecord.close();
        }
        if (reader != null) {
            throw new IllegalStateException(
                    "This reader has been initialized with an incompatible constructor, 'reader' is not null");
        }
        if (rin == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'rin' is null");
        }
        if (offset < -1) {
            throw new IllegalArgumentException(
                    "The 'offset' is less than -1: " + offset);
        }
    }

    /**
     * Opens a new <code>GzipReader</code> on the stream, parses a record
     * from its first entry and stamps the record with the given offset.
     * @param rin input stream to read the GZip entry from
     * @param offset start offset to assign to the record
     * @param buffer_size entry stream buffer size, not buffered unless
     * greater than zero
     * @return the parsed record or null
     * @throws IOException if an I/O error occurs while reading
     */
    private WarcRecord readRecordFrom(InputStream rin, long offset,
                                    int buffer_size) throws IOException {
        currentRecord = null;
        currentReader = new GzipReader(rin);
        currentEntry = currentReader.getNextEntry();
        if (currentEntry != null) {
            currentRecord = WarcRecord.parseRecord(
                    wrapEntry(currentEntry, buffer_size), this);
        }
        if (currentRecord != null) {
            startOffset = offset;
            currentRecord.header.startOffset = offset;
        }
        return currentRecord;
    }

    /**
     * Wraps the input stream of a GZip entry in the push back stream handed
     * to the record parser, with an optional buffering layer in between.
     * @param entry GZip entry whose input stream is wrapped
     * @param buffer_size buffer size, buffering is only applied when this
     * is greater than zero
     * @return push back stream with a push back buffer of
     * <code>PUSHBACK_BUFFER_SIZE</code> bytes
     * @throws IOException if the entry input stream can not be obtained
     */
    private static ByteCountingPushBackInputStream wrapEntry(
                    GzipEntry entry, int buffer_size) throws IOException {
        InputStream in = entry.getInputStream();
        if (buffer_size > 0) {
            in = new BufferedInputStream(in, buffer_size);
        }
        return new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
    }

}