/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;

/**
 * WARC-Date parser and format validator. The format "yyyy-MM-dd'T'HH:mm:ss'Z'"
 * is specified in the WARC ISO standard.
 *
 * <p>Each thread gets its own parser instance, because the underlying
 * <code>DateFormat</code> is not thread safe.</p>
 *
 * @author lbihanic, selghissassi, nicl
 */
public final class WarcDateParser {

    /** WARC <code>DateFormat</code> as specified in the WARC ISO standard. */
    private final DateFormat dateFormat;

    /** Basic <code>DateFormat</code> is not thread safe, so keep one parser per thread. */
    private static final ThreadLocal<WarcDateParser> DATE_PARSER_TL =
            new ThreadLocal<WarcDateParser>() {
        @Override
        protected WarcDateParser initialValue() {
            return new WarcDateParser();
        }
    };

    /**
     * Creates a new <code>WarcDateParser</code> with a strict (non-lenient),
     * UTC based <code>DateFormat</code>.
     */
    private WarcDateParser() {
        // Pin the locale: the single-argument constructor would pick up the
        // JVM default locale, which can select a non-Gregorian calendar or
        // non-ASCII digits and thereby change how WARC dates are read/written.
        dateFormat = new SimpleDateFormat(WarcConstants.WARC_DATE_FORMAT, Locale.ROOT);
        dateFormat.setLenient(false);
        dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
    }

    /**
     * Parses a date.
     * @param dateStr date to parse
     * @return the parsed date or <code>null</code> if unable to parse date
     */
    private Date parseDate(String dateStr) {
        // We subtract 4 from the format because of the ' characters.
        // These characters are to specify constants in the format string.
        if (dateStr == null
                || dateStr.length() != WarcConstants.WARC_DATE_FORMAT.length() - 4) {
            return null;
        }
        // Support upper/lower-case ('t'/'z'), independent of the default locale.
        String normalized = dateStr.toUpperCase(Locale.ROOT);
        // This overload reports failure by returning null instead of throwing.
        Date date = dateFormat.parse(normalized, new ParsePosition(0));
        if (date == null) {
            return null;
        }
        // SimpleDateFormat tolerates skipped blanks and fields with too few or
        // too many digits, so a string of the right length can still deviate
        // from the format. Only accept input that round-trips unchanged.
        if (!dateFormat.format(date).equals(normalized)) {
            return null;
        }
        return date;
    }

    /**
     * Parses the date using the format "yyyy-MM-ddTHH:mm:ssZ".
     * The 'T' and 'Z' markers are accepted in upper and lower case.
     * @param dateStr the date to parse
     * @return the parsed date, or <code>null</code> if the date to parse is
     * not compliant with the format "yyyy-MM-ddTHH:mm:ssZ" or is not strictly
     * later than 1970-01-01T00:00:00Z
     */
    public static Date getDate(String dateStr) {
        Date date = DATE_PARSER_TL.get().parseDate(dateStr);
        // Dates at or before the epoch are treated as invalid.
        if (date != null && date.getTime() > 0) {
            return date;
        }
        return null;
    }

    /**
     * Return a <code>DateFormat</code> object which can be used to string
     * format WARC dates.
     *
     * <p>The returned object is the calling thread's own instance and is the
     * same one used by {@link #getDate(String)} on that thread; it should not
     * be reconfigured or handed to other threads.</p>
     * @return <code>DateFormat</code> object which can be used to string
     * format WARC dates.
     */
    public static DateFormat getDateFormat() {
        return DATE_PARSER_TL.get().dateFormat;
    }

}