001/** 002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC 003 * and GZip files. (http://jwat.org/) 004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/) 005 * 006 * Licensed under the Apache License, Version 2.0 (the "License"); 007 * you may not use this file except in compliance with the License. 008 * You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.jwat.warc; 019 020import java.io.File; 021import java.io.IOException; 022import java.io.RandomAccessFile; 023import java.util.UUID; 024 025import org.jwat.common.RandomAccessFileOutputStream; 026import org.jwat.common.Uri; 027 028/** 029 * Simple WARC file writer wrapping some of the trivial code related to writing records. 030 * Handles automatic max file size closing and renaming of old file and opening of new file. 031 * The preferred workflow is to class nextWriter() and close(). Using open() does not expose a clean internal state. 032 * 033 * @author nicl 034 */ 035public class WarcFileWriter { 036 037 /** Suffix used for open files. */ 038 public static final String ACTIVE_SUFFIX = ".open"; 039 040 /** Overall WARC file writer configuration. */ 041 protected WarcFileWriterConfig warcFileConfig; 042 043 /** WARC file naming Configuration. */ 044 protected WarcFileNaming warcFileNaming; 045 046 /** Current sequence number. */ 047 protected int sequenceNr = -1; 048 049 /** Current WARC file. */ 050 protected File writerFile; 051 052 /** Current random access file. */ 053 protected RandomAccessFile writer_raf; 054 055 /** Current random access output stream. */ 056 protected RandomAccessFileOutputStream writer_rafout; 057 058 /** Current WARC writer. */ 059 public WarcWriter writer; 060 061 /** Generated WARC-Info-Record-ID for the current file. */ 062 public Uri warcinfoRecordId; 063 064 /** 065 * Constructor for internal and unit test use. 066 */ 067 protected WarcFileWriter() { 068 } 069 070 /** 071 * Returns a configured WARC file writer. 072 * @param warcFileNaming WARC file naming configuration 073 * @param warcFileConfig overall WARC writer configuration 074 * @return WARC file writer instance using the supplied configuration 075 */ 076 public static WarcFileWriter getWarcWriterInstance(WarcFileNaming warcFileNaming, WarcFileWriterConfig warcFileConfig) { 077 WarcFileWriter wfw = new WarcFileWriter(); 078 wfw.warcFileNaming = warcFileNaming; 079 wfw.warcFileConfig = warcFileConfig; 080 /* 081 StringBuilder sb = new StringBuilder(); 082 sb.append("software"); 083 sb.append(": "); 084 sb.append("Netarchivesuite"); 085 sb.append("\r\n"); 086 sb.append("format"); 087 sb.append(": "); 088 sb.append("WARC file version 1.0"); 089 sb.append("\r\n"); 090 sb.append("conformsTo"); 091 sb.append(": "); 092 sb.append("http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"); 093 sb.append("\r\n"); 094 wfw.warcFields = sb.toString(); 095 */ 096 return wfw; 097 } 098 099 /** 100 * Returns the current sequence number. 101 * @return the current sequence number 102 */ 103 public int getSequenceNr() { 104 return sequenceNr; 105 } 106 107 /** 108 * Returns the current EARC file object. 109 * @return current EARC file object 110 */ 111 public File getFile() { 112 return writerFile; 113 } 114 115 /** 116 * Returns the current WARC writer object. 117 * @return current WARC writer object 118 */ 119 public WarcWriter getWriter() { 120 return writer; 121 } 122 123 /** 124 * Open new file with active prefix and prepare for writing. 125 * @throws IOException if an I/O exception occurs while opening file 126 */ 127 public void open() throws IOException { 128 if (writer == null) { 129 ++sequenceNr; 130 String finishedFilename = warcFileNaming.getFilename(sequenceNr, warcFileConfig.bCompression); 131 String activeFilename = finishedFilename + ACTIVE_SUFFIX; 132 File finishedFile = new File(warcFileConfig.targetDir, finishedFilename); 133 writerFile = new File(warcFileConfig.targetDir, activeFilename); 134 if (writerFile.exists()) { 135 if (warcFileConfig.bOverwrite) { 136 writerFile.delete(); 137 } else { 138 throw new IOException("'" + writerFile + "' already exists, will not overwrite"); 139 } 140 } 141 if (finishedFile.exists()) { 142 if (warcFileConfig.bOverwrite) { 143 finishedFile.delete(); 144 } else { 145 throw new IOException("'" + finishedFile + "' already exists, will not overwrite"); 146 } 147 } 148 writer_raf = new RandomAccessFile(writerFile, "rw"); 149 writer_raf.seek(0L); 150 writer_raf.setLength(0L); 151 writer_rafout = new RandomAccessFileOutputStream(writer_raf); 152 writer = WarcWriterFactory.getWriter(writer_rafout, 8192, warcFileConfig.bCompression); 153 } 154 } 155 156 /** 157 * Checks to see whether a new file needs to be created. Depending on the configuration this also checks if the max file size has been reached and closes/renames the old file and opens a new one. 158 * @return boolean indicating whether new writer/file was created 159 * @throws Exception if an exception occurs 160 */ 161 public boolean nextWriter() throws Exception { 162 boolean bNewWriter = false; 163 if (writer_raf == null) { 164 bNewWriter = true; 165 } else if (warcFileNaming.supportMultipleFiles() && writer_raf.length() > warcFileConfig.maxFileSize) { 166 close(); 167 bNewWriter = true; 168 } 169 if (bNewWriter) { 170 open(); 171 //byte[] warcFieldsBytes = warcFields.getBytes("ISO-8859-1"); 172 //ByteArrayInputStream bin = new ByteArrayInputStream(warcFieldsBytes); 173 warcinfoRecordId = new Uri("urn:uuid:" + UUID.randomUUID()); 174 /* 175 WarcRecord record = WarcRecord.createRecord(writer); 176 WarcHeader header = record.header; 177 header.warcTypeIdx = WarcConstants.RT_IDX_WARCINFO; 178 header.warcDate = new Date(); 179 header.warcFilename = finishedFilename; 180 header.warcRecordIdUri = warcinfoRecordId; 181 header.contentTypeStr = WarcConstants.CT_APP_WARC_FIELDS; 182 header.contentLength = new Long(warcFieldsBytes.length); 183 writer.writeHeader(record); 184 writer.streamPayload(bin); 185 writer.closeRecord(); 186 */ 187 } 188 return bNewWriter; 189 } 190 191 /** 192 * Close writer and release all resources. 193 * @throws IOException in an I/O exception occurs while closing resources 194 */ 195 public void close() throws IOException { 196 if (writer != null) { 197 writer.close(); 198 writer = null; 199 } 200 if (writer_rafout != null) { 201 writer_rafout.close(); 202 writer_rafout = null; 203 } 204 if (writer_raf != null) { 205 writer_raf.close(); 206 writer_raf = null; 207 } 208 warcinfoRecordId = null; 209 if (writerFile != null && writerFile.getName().endsWith(ACTIVE_SUFFIX)) { 210 String finishedName = writerFile.getName().substring(0, writerFile.getName().length() - ACTIVE_SUFFIX.length()); 211 File finishedFile = new File(writerFile.getParent(), finishedName); 212 if (finishedFile.exists()) { 213 throw new IOException("unable to rename '" + writerFile + "' to '" + finishedFile + "' - destination file already exists"); 214 } 215 boolean success = writerFile.renameTo(finishedFile); 216 if (!success) { 217 throw new IOException("unable to rename '" + writerFile + "' to '" + finishedFile + "' - unknown problem"); 218 } 219 } 220 writerFile = null; 221 } 222 223}