001/**
002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
003 * and GZip files. (http://jwat.org/)
004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.jwat.warc;
019
020import java.io.File;
021import java.io.IOException;
022import java.io.RandomAccessFile;
023import java.util.UUID;
024
025import org.jwat.common.RandomAccessFileOutputStream;
026import org.jwat.common.Uri;
027
028/**
029 * Simple WARC file writer wrapping some of the trivial code related to writing records.
030 * Handles automatic max file size closing and renaming of old file and opening of new file.
031 * The preferred workflow is to class nextWriter() and close(). Using open() does not expose a clean internal state.
032 *
033 * @author nicl
034 */
035public class WarcFileWriter {
036
037    /** Suffix used for open files. */
038    public static final String ACTIVE_SUFFIX = ".open";
039
040    /** Overall WARC file writer configuration. */
041    protected WarcFileWriterConfig warcFileConfig;
042
043    /** WARC file naming Configuration. */
044    protected WarcFileNaming warcFileNaming;
045
046    /** Current sequence number. */
047    protected int sequenceNr = -1;
048
049    /** Current WARC file. */
050    protected File writerFile;
051
052    /** Current random access file. */
053    protected RandomAccessFile writer_raf;
054
055    /** Current random access output stream. */
056    protected RandomAccessFileOutputStream writer_rafout;
057
058    /** Current WARC writer. */
059    public WarcWriter writer;
060
061    /** Generated WARC-Info-Record-ID for the current file. */
062    public Uri warcinfoRecordId;
063
064    /**
065     * Constructor for internal and unit test use.
066     */
067    protected WarcFileWriter() {
068    }
069
070    /**
071     * Returns a configured WARC file writer.
072     * @param warcFileNaming WARC file naming configuration
073     * @param warcFileConfig overall WARC writer configuration
074     * @return WARC file writer instance using the supplied configuration
075     */
076    public static WarcFileWriter getWarcWriterInstance(WarcFileNaming warcFileNaming, WarcFileWriterConfig warcFileConfig) {
077        WarcFileWriter wfw = new WarcFileWriter();
078        wfw.warcFileNaming = warcFileNaming;
079        wfw.warcFileConfig = warcFileConfig;
080        /*
081        StringBuilder sb = new StringBuilder();
082        sb.append("software");
083        sb.append(": ");
084        sb.append("Netarchivesuite");
085        sb.append("\r\n");
086        sb.append("format");
087        sb.append(": ");
088        sb.append("WARC file version 1.0");
089        sb.append("\r\n");
090        sb.append("conformsTo");
091        sb.append(": ");
092        sb.append("http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
093        sb.append("\r\n");
094        wfw.warcFields = sb.toString();
095        */
096        return wfw;
097    }
098
099    /**
100     * Returns the current sequence number.
101     * @return the current sequence number
102     */
103    public int getSequenceNr() {
104        return sequenceNr;
105    }
106
107    /**
108     * Returns the current EARC file object.
109     * @return current EARC file object
110     */
111    public File getFile() {
112        return writerFile;
113    }
114
115    /**
116     * Returns the current WARC writer object.
117     * @return current WARC writer object
118     */
119    public WarcWriter getWriter() {
120        return writer;
121    }
122
123    /**
124     * Open new file with active prefix and prepare for writing.
125     * @throws IOException if an I/O exception occurs while opening file
126     */
127    public void open() throws IOException {
128        if (writer == null) {
129            ++sequenceNr;
130            String finishedFilename = warcFileNaming.getFilename(sequenceNr, warcFileConfig.bCompression);
131            String activeFilename = finishedFilename + ACTIVE_SUFFIX;
132            File finishedFile = new File(warcFileConfig.targetDir, finishedFilename);
133            writerFile = new File(warcFileConfig.targetDir, activeFilename);
134            if (writerFile.exists()) {
135                if (warcFileConfig.bOverwrite) {
136                    writerFile.delete();
137                } else {
138                    throw new IOException("'" + writerFile + "' already exists, will not overwrite");
139                }
140            }
141            if (finishedFile.exists()) {
142                if (warcFileConfig.bOverwrite) {
143                    finishedFile.delete();
144                } else {
145                    throw new IOException("'" + finishedFile + "' already exists, will not overwrite");
146                }
147            }
148            writer_raf = new RandomAccessFile(writerFile, "rw");
149            writer_raf.seek(0L);
150            writer_raf.setLength(0L);
151            writer_rafout = new RandomAccessFileOutputStream(writer_raf);
152            writer = WarcWriterFactory.getWriter(writer_rafout, 8192, warcFileConfig.bCompression);
153        }
154    }
155
156    /**
157     * Checks to see whether a new file needs to be created. Depending on the configuration this also checks if the max file size has been reached and closes/renames the old file and opens a new one.
158     * @return boolean indicating whether new writer/file was created
159     * @throws Exception if an exception occurs
160     */
161    public boolean nextWriter() throws Exception {
162        boolean bNewWriter = false;
163        if (writer_raf == null) {
164            bNewWriter = true;
165        } else if (warcFileNaming.supportMultipleFiles() && writer_raf.length() > warcFileConfig.maxFileSize) {
166            close();
167            bNewWriter = true;
168        }
169        if (bNewWriter) {
170            open();
171            //byte[] warcFieldsBytes = warcFields.getBytes("ISO-8859-1");
172            //ByteArrayInputStream bin = new ByteArrayInputStream(warcFieldsBytes);
173            warcinfoRecordId = new Uri("urn:uuid:" + UUID.randomUUID());
174            /*
175            WarcRecord record = WarcRecord.createRecord(writer);
176            WarcHeader header = record.header;
177            header.warcTypeIdx = WarcConstants.RT_IDX_WARCINFO;
178            header.warcDate = new Date();
179            header.warcFilename = finishedFilename;
180            header.warcRecordIdUri = warcinfoRecordId;
181            header.contentTypeStr = WarcConstants.CT_APP_WARC_FIELDS;
182            header.contentLength = new Long(warcFieldsBytes.length);
183            writer.writeHeader(record);
184            writer.streamPayload(bin);
185            writer.closeRecord();
186            */
187        }
188        return bNewWriter;
189    }
190
191    /**
192     * Close writer and release all resources.
193     * @throws IOException in an I/O exception occurs while closing resources
194     */
195    public void close() throws IOException {
196        if (writer != null) {
197            writer.close();
198            writer = null;
199        }
200        if (writer_rafout != null) {
201            writer_rafout.close();
202            writer_rafout = null;
203        }
204        if (writer_raf != null) {
205            writer_raf.close();
206            writer_raf = null;
207        }
208        warcinfoRecordId = null;
209        if (writerFile != null && writerFile.getName().endsWith(ACTIVE_SUFFIX)) {
210            String finishedName = writerFile.getName().substring(0, writerFile.getName().length() - ACTIVE_SUFFIX.length());
211            File finishedFile = new File(writerFile.getParent(), finishedName);
212            if (finishedFile.exists()) {
213                throw new IOException("unable to rename '" + writerFile + "' to '" + finishedFile + "' - destination file already exists");
214            }
215            boolean success = writerFile.renameTo(finishedFile);
216            if (!success) {
217                throw new IOException("unable to rename '" + writerFile + "' to '" + finishedFile + "' - unknown problem");
218            }
219        }
220        writerFile = null;
221    }
222
223}