/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.jwat.archive.common.ReaderFactoryAbstract;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.gzip.GzipReader;

/**
 * Factory used for creating <code>WarcReader</code> instances.
 * The general <code>getReader</code> methods will auto-detect Gzip'ed data
 * and return the appropriate <code>WarcReader</code> instances.
 * The other factory methods can be used to return specific
 * <code>WarcReader</code> instances for compressed or uncompressed records.
 * Readers are available for both sequential and random reading of records.
 * Use of buffered methods and/or buffering speeds up the reader considerably.
 *
 * @author nicl
 */
public class WarcReaderFactory extends ReaderFactoryAbstract {

    /** Buffer size used by <code>PushbackInputStream</code>. */
    public static final int PUSHBACK_BUFFER_SIZE = 32;

    /**
     * Non-public constructor; instances are not needed since all factory
     * methods are static. Kept <code>protected</code> so the class can still
     * be extended.
     */
    protected WarcReaderFactory() {
    }

    /**
     * Check head of <code>PushBackInputStream</code> for a WARC file identifier.
     * The identifier for WARC files is "WARC/" in the beginning.
     * This is the same check as {@link #isWarcRecord(ByteCountingPushBackInputStream)}.
     * @param pbin <code>PushBackInputStream</code> with WARC records
     * @return boolean indicating presence of a WARC file identifier
     * @throws IOException if an i/o error occurs while examining head of stream
     */
    public static boolean isWarcFile(ByteCountingPushBackInputStream pbin) throws IOException {
        return isWarcRecord(pbin);
    }

    /**
     * Check head of <code>PushBackInputStream</code> for a WARC record identifier.
     * The identifier for WARC records is "WARC/" in the beginning.
     * @param pbin <code>PushBackInputStream</code> with WARC records
     * @return boolean indicating presence of a WARC magic number
     * @throws IOException if an i/o error occurs while examining head of stream
     */
    public static boolean isWarcRecord(ByteCountingPushBackInputStream pbin) throws IOException {
        // Encode the magic header with an explicit charset so the comparison
        // does not depend on the platform default encoding.
        // UnsupportedEncodingException is an IOException, so the declared
        // throws clause already covers it.
        byte[] warcBytes = WarcConstants.WARC_MAGIC_HEADER.getBytes("US-ASCII");
        // Size the peek buffer from the encoded bytes rather than the
        // character count, so both arrays always have the same length.
        byte[] streamBytes = new byte[warcBytes.length];
        // Look for the leading magic bytes in front of every valid WARC record.
        // If fewer bytes are available the remainder stays zero-filled and the
        // comparison below simply fails.
        pbin.peek(streamBytes);
        return Arrays.equals(warcBytes, streamBytes);
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * wrapped by a <code>BufferedInputStream</code>.
     * The <code>WarcReader</code> implementation returned is chosen based on
     * GZip auto detection.
     * @param in WARC File represented as <code>InputStream</code>
     * @param buffer_size buffer size to use
     * @return appropriate <code>WarcReader</code> based on data read from
     * <code>InputStream</code>
     * @throws IOException if an i/o exception occurs during initialization
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReader getReader(InputStream in, int buffer_size)
                                                        throws IOException {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: " +
                    buffer_size);
        }
        ByteCountingPushBackInputStream pbin =
                new ByteCountingPushBackInputStream(
                        new BufferedInputStream(in, buffer_size),
                        PUSHBACK_BUFFER_SIZE);
        if (GzipReader.isGzipped(pbin)) {
            return new WarcReaderCompressed(new GzipReader(pbin),
                                            buffer_size);
        }
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>.
     * The <code>WarcReader</code> implementation returned is chosen based on
     * GZip auto detection.
     * @param in WARC File represented as <code>InputStream</code>
     * @return appropriate <code>WarcReader</code> based on data read from
     * <code>InputStream</code>
     * @throws IOException if an i/o exception occurs during initialization
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReader getReader(InputStream in) throws IOException {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
        ByteCountingPushBackInputStream pbin =
                new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
        if (GzipReader.isGzipped(pbin)) {
            return new WarcReaderCompressed(new GzipReader(pbin));
        }
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new <code>WarcReader</code> without any associated
     * <code>InputStream</code> for random access to uncompressed records.
     * @return <code>WarcReader</code> for uncompressed records, with no
     * <code>InputStream</code> attached
     */
    public static WarcReaderUncompressed getReaderUncompressed() {
        return new WarcReaderUncompressed();
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * primarily for random access to uncompressed records.
     * @param in WARC File represented as <code>InputStream</code>
     * @return <code>WarcReader</code> for uncompressed records read from
     * <code>InputStream</code>
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReaderUncompressed getReaderUncompressed(InputStream in)
                                                        throws IOException {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
        ByteCountingPushBackInputStream pbin =
                new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * wrapped by a <code>BufferedInputStream</code> primarily for random
     * access to uncompressed records.
     * @param in WARC File represented as <code>InputStream</code>
     * @param buffer_size buffer size to use
     * @return <code>WarcReader</code> for uncompressed records read from
     * <code>InputStream</code>
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReaderUncompressed getReaderUncompressed(InputStream in,
                                        int buffer_size) throws IOException {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: " +
                    buffer_size);
        }
        ByteCountingPushBackInputStream pbin =
                new ByteCountingPushBackInputStream(
                        new BufferedInputStream(in, buffer_size),
                        PUSHBACK_BUFFER_SIZE);
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new <code>WarcReader</code> without any associated
     * <code>InputStream</code> for random access to GZip compressed records.
     * @return <code>WarcReader</code> for GZip compressed records, with no
     * <code>InputStream</code> attached
     */
    public static WarcReaderCompressed getReaderCompressed() {
        return new WarcReaderCompressed();
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * primarily for random access to GZip compressed records.
     * @param in WARC File represented as <code>InputStream</code>
     * @return <code>WarcReader</code> for GZip compressed records read from
     * <code>InputStream</code>
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReaderCompressed getReaderCompressed(InputStream in)
                                                        throws IOException {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
        return new WarcReaderCompressed(new GzipReader(in));
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * wrapped by a <code>BufferedInputStream</code> primarily for random
     * access to GZip compressed records.
     * @param in WARC File represented as <code>InputStream</code>
     * @param buffer_size buffer size to use
     * @return <code>WarcReader</code> for GZip compressed records read from
     * <code>InputStream</code>
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReaderCompressed getReaderCompressed(InputStream in,
                                        int buffer_size) throws IOException {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: " +
                    buffer_size);
        }
        // Hand buffer_size to the reader as well, the same way
        // getReader(InputStream, int) does for GZip'ed input, so the two
        // buffered factory methods configure the compressed reader alike.
        return new WarcReaderCompressed(
                new GzipReader(new BufferedInputStream(in, buffer_size)),
                buffer_size);
    }

}