001/** 002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC 003 * and GZip files. (http://jwat.org/) 004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/) 005 * 006 * Licensed under the Apache License, Version 2.0 (the "License"); 007 * you may not use this file except in compliance with the License. 008 * You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.jwat.warc; 019 020import java.io.BufferedInputStream; 021import java.io.IOException; 022import java.io.InputStream; 023 024import org.jwat.common.ByteCountingPushBackInputStream; 025 026/** 027 * WARC Reader implementation for reading uncompressed files. 028 * Use WarcReaderFactory to get an instance of this class. 029 * 030 * @author nicl 031 */ 032public class WarcReaderUncompressed extends WarcReader { 033 034 /** Buffer size used by <code>PushbackInputStream</code>. */ 035 public static final int PUSHBACK_BUFFER_SIZE = 32; 036 037 /** WARC file <code>InputStream</code>. */ 038 protected ByteCountingPushBackInputStream in; 039 040 /** Start offset of current or next valid record. */ 041 protected long startOffset = 0; 042 043 /** 044 * This constructor is used to get random access to records. 045 * The records are then accessed using the getNextRecordFrom methods 046 * using a supplied input stream for each record. 047 */ 048 public WarcReaderUncompressed() { 049 init(); 050 } 051 052 /** 053 * Construct reader using the supplied input stream. 054 * This method is primarily for sequential access to records. 055 * @param in WARC file input stream 056 */ 057 public WarcReaderUncompressed(ByteCountingPushBackInputStream in) { 058 if (in == null) { 059 throw new IllegalArgumentException( 060 "The inputstream 'in' is null"); 061 } 062 this.in = in; 063 init(); 064 } 065 066 @Override 067 public boolean isCompressed() { 068 return false; 069 } 070 071 @Override 072 public void close() { 073 if (currentRecord != null) { 074 try { 075 currentRecord.close(); 076 } catch (IOException e) { /* ignore */ } 077 currentRecord = null; 078 } 079 if (in != null) { 080 consumed = in.getConsumed(); 081 try { 082 in.close(); 083 } catch (IOException e) { /* ignore */ } 084 in = null; 085 } 086 } 087 088 @Override 089 protected void recordClosed() { 090 if (currentRecord != null) { 091 consumed += currentRecord.consumed; 092 } else { 093 throw new IllegalStateException("'currentRecord' is null, this should never happen!"); 094 } 095 } 096 097 @Override 098 public long getStartOffset() { 099 return startOffset; 100 } 101 102 @Override 103 public long getOffset() { 104 if (in != null) { 105 return in.getConsumed(); 106 } else { 107 return consumed; 108 } 109 } 110 111 @Override 112 public long getConsumed() { 113 if (in != null) { 114 return in.getConsumed(); 115 } else { 116 return consumed; 117 } 118 } 119 120 @Override 121 public WarcRecord getNextRecord() throws IOException { 122 if (currentRecord != null) { 123 currentRecord.close(); 124 } 125 if (in == null) { 126 throw new IllegalStateException( 127 "This reader has been initialized with an incompatible constructor, 'in' is null"); 128 } 129 currentRecord = WarcRecord.parseRecord(in, this); 130 if (currentRecord != null) { 131 startOffset = currentRecord.getStartOffset(); 132 } 133 return currentRecord; 134 } 135 136 @Override 137 public WarcRecord getNextRecordFrom(InputStream rin, long offset) 138 throws IOException { 139 if (currentRecord != null) { 140 currentRecord.close(); 141 } 142 if (in != null) { 143 throw new IllegalStateException( 144 "This reader has been initialized with an incompatible constructor, 'in' is not null"); 145 } 146 if (rin == null) { 147 throw new IllegalArgumentException( 148 "The inputstream 'rin' is null"); 149 } 150 if (offset < -1) { 151 throw new IllegalArgumentException( 152 "The 'offset' is less than -1: " + offset); 153 } 154 ByteCountingPushBackInputStream pbin = 155 new ByteCountingPushBackInputStream(rin, PUSHBACK_BUFFER_SIZE); 156 currentRecord = WarcRecord.parseRecord(pbin, this); 157 if (currentRecord != null) { 158 startOffset = offset; 159 currentRecord.header.startOffset = offset; 160 } 161 return currentRecord; 162 } 163 164 @Override 165 public WarcRecord getNextRecordFrom(InputStream rin, long offset, 166 int buffer_size) throws IOException { 167 if (currentRecord != null) { 168 currentRecord.close(); 169 } 170 if (in != null) { 171 throw new IllegalStateException( 172 "This reader has been initialized with an incompatible constructor, 'in' is not null"); 173 } 174 if (rin == null) { 175 throw new IllegalArgumentException( 176 "The inputstream 'rin' is null"); 177 } 178 if (offset < -1) { 179 throw new IllegalArgumentException( 180 "The 'offset' is less than -1: " + offset); 181 } 182 if (buffer_size <= 0) { 183 throw new IllegalArgumentException( 184 "The 'buffer_size' is less than or equal to zero: " 185 + buffer_size); 186 } 187 ByteCountingPushBackInputStream pbin = 188 new ByteCountingPushBackInputStream( 189 new BufferedInputStream(rin, buffer_size), 190 PUSHBACK_BUFFER_SIZE); 191 currentRecord = WarcRecord.parseRecord(pbin, this); 192 if (currentRecord != null) { 193 startOffset = offset; 194 currentRecord.header.startOffset = offset; 195 } 196 return currentRecord; 197 } 198 199}