001/** 002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC 003 * and GZip files. (http://jwat.org/) 004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/) 005 * 006 * Licensed under the Apache License, Version 2.0 (the "License"); 007 * you may not use this file except in compliance with the License. 008 * You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.jwat.warc; 019 020import org.jwat.common.ContentType; 021import org.jwat.common.Diagnosis; 022import org.jwat.common.DiagnosisType; 023import org.jwat.common.Diagnostics; 024import org.jwat.common.IPAddressParser; 025import org.jwat.common.Uri; 026import org.jwat.common.UriProfile; 027 028import java.net.InetAddress; 029import java.util.Date; 030 031/** 032 * Separate class containing all the different types of field parser. 033 * Including validating parsers for strings, integers, longs, 034 * content-types, IP's, URI's, WARC dates, and WARC digests. 035 * 036 * @author nicl 037 */ 038public class WarcFieldParsers { 039 040 /** Diagnostics used to report diagnoses. 041 * Must be set prior to calling the various methods. */ 042 protected Diagnostics<Diagnosis> diagnostics; 043 044 /** 045 * Add an error diagnosis on the given entity stating that it is invalid 046 * and something else was expected. The optional information should provide 047 * more details and/or format information. 048 * @param entity entity examined 049 * @param information optional extra information 050 */ 051 protected void addInvalidExpectedError(String entity, String... information) { 052 diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_EXPECTED, entity, information)); 053 } 054 055 /** 056 * Add a warning diagnosis on the given entity stating that it is empty. 057 * @param entity entity examined 058 */ 059 protected void addEmptyWarning(String entity) { 060 diagnostics.addWarning(new Diagnosis(DiagnosisType.EMPTY, entity)); 061 } 062 063 /** 064 * Validates that the string is not null. 065 * @param str the value to validate 066 * @param field field name 067 * @return the original value 068 */ 069 protected String parseString(String str, String field) { 070 if (((str == null) || (str.trim().length() == 0))) { 071 addEmptyWarning("'" + field + "' field"); 072 } 073 return str; 074 } 075 076 /** 077 * Returns an Integer object holding the value of the specified string. 078 * @param intStr the value to parse. 079 * @param field field name 080 * @return an integer object holding the value of the specified string or null, 081 * if unable to parse the value as an integer 082 */ 083 protected Integer parseInteger(String intStr, String field) { 084 Integer iVal = null; 085 if (intStr != null && intStr.length() > 0) { 086 try { 087 iVal = Integer.valueOf(intStr); 088 } catch (Exception e) { 089 // Invalid integer value. 090 addInvalidExpectedError("'" + field + "' value", 091 intStr, 092 "Numeric format"); 093 } 094 } else { 095 // Missing integer value. 096 addEmptyWarning("'" + field + "' field"); 097 } 098 return iVal; 099 } 100 101 /** 102 * Returns a Long object holding the value of the specified string. 103 * @param longStr the value to parse. 104 * @param field field name 105 * @return a long object holding the value of the specified string or null, 106 * if unable to parse the value as a Long 107 */ 108 protected Long parseLong(String longStr, String field) { 109 Long lVal = null; 110 if (longStr != null && longStr.length() > 0) { 111 try { 112 lVal = Long.valueOf(longStr); 113 } catch (Exception e) { 114 // Invalid long value. 115 addInvalidExpectedError("'" + field + "' value", 116 longStr, 117 "Numeric format"); 118 } 119 } else { 120 // Missing long value. 121 addEmptyWarning("'" + field + "' field"); 122 } 123 return lVal; 124 } 125 126 /** 127 * Parse and validate content-type string with optional parameters. 128 * @param contentTypeStr content-type string to parse 129 * @param field field name 130 * @return content type or null, if unable to extract the 131 * content-type 132 */ 133 protected ContentType parseContentType(String contentTypeStr, String field) { 134 ContentType contentType = null; 135 if (contentTypeStr != null && contentTypeStr.length() != 0) { 136 contentType = ContentType.parseContentType(contentTypeStr); 137 if (contentType == null) { 138 // Invalid content-type. 139 addInvalidExpectedError("'" + field + "' value", 140 contentTypeStr, 141 WarcConstants.CONTENT_TYPE_FORMAT); 142 } 143 } else { 144 // Missing content-type. 145 addEmptyWarning("'" + field + "' field"); 146 } 147 return contentType; 148 } 149 150 /** 151 * Parse and validate an IP address. 152 * @param ipAddress the IP address to parse 153 * @param field field name 154 * @return the IP address or null, if unable to parse the value as an 155 * IP-address 156 */ 157 protected InetAddress parseIpAddress(String ipAddress, String field) { 158 InetAddress inetAddr = null; 159 if (ipAddress != null && ipAddress.length() > 0) { 160 inetAddr = IPAddressParser.getAddress(ipAddress); 161 if (inetAddr == null) { 162 // Invalid ip address. 163 addInvalidExpectedError("'" + field + "' value", 164 ipAddress, 165 "IPv4 or IPv6 format"); 166 } 167 } else { 168 // Missing ip address. 169 addEmptyWarning("'" + field + "' field"); 170 } 171 return inetAddr; 172 } 173 174 /** 175 * Returns an URI object holding the value of the specified string. 176 * @param uriStr the URL to parse 177 * @param field field name 178 * @param uriProfile the uri profile 179 * @param bLtGt something 180 * @return an URI object holding the value of the specified string or null, 181 * if unable to parse the value as an URI object 182 */ 183 protected Uri parseUri(String uriStr, boolean bLtGt, UriProfile uriProfile, String field) { 184 Uri uri = null; 185 String uriStrClean = uriStr; 186 int ltGtBf = 0; 187 if (uriStrClean != null && uriStrClean.length() != 0) { 188 int fIdx = 0; 189 int tIdx = uriStrClean.length(); 190 if (uriStrClean.startsWith("<")) { 191 ltGtBf |= 2; 192 ++fIdx; 193 } 194 if (uriStrClean.endsWith(">")) { 195 ltGtBf |= 1; 196 --tIdx; 197 } 198 if (ltGtBf != 0) { 199 uriStrClean = uriStrClean.substring(fIdx, tIdx); 200 } 201 if (bLtGt) { 202 switch (ltGtBf) { 203 case 2: 204 addInvalidExpectedError("'" + field + "' value", uriStr, "Missing trailing '>' character"); 205 break; 206 case 1: 207 addInvalidExpectedError("'" + field + "' value", uriStr, "Missing leading '<' character"); 208 break; 209 case 0: 210 addInvalidExpectedError("'" + field + "' value", uriStr, "Missing encapsulating '<' and '>' characters"); 211 break; 212 case 3: 213 default: 214 break; 215 } 216 } else { 217 switch (ltGtBf) { 218 case 2: 219 addInvalidExpectedError("'" + field + "' value", uriStr, "Unexpected leading '<' character"); 220 break; 221 case 1: 222 addInvalidExpectedError("'" + field + "' value", uriStr, "Unexpected trailing '>' character"); 223 break; 224 case 3: 225 addInvalidExpectedError("'" + field + "' value", uriStr, "Unexpected encapsulating '<' and '>' characters"); 226 break; 227 case 0: 228 default: 229 break; 230 } 231 } 232 try { 233 uri = new Uri(uriStrClean, uriProfile); 234 } catch (Exception e) { 235 // Invalid URI. 236 addInvalidExpectedError("'" + field + "' value", 237 uriStrClean, 238 e.getMessage()); 239 } 240 if (uri != null) { 241 String scheme = uri.getScheme(); 242 if (scheme == null) { 243 uri = null; 244 // Relative URI. 245 addInvalidExpectedError("'" + field + "' value", 246 uriStrClean, 247 "Absolute URI"); 248 } else { 249 scheme = scheme.toLowerCase(); 250 } 251 } 252 } else { 253 // Missing URI. 254 addEmptyWarning("'" + field + "' field"); 255 } 256 return uri; 257 } 258 259 /** 260 * Parses WARC record date. 261 * @param dateStr the date to parse. 262 * @param field field name 263 * @return the formatted date or null, if unable to parse the value as a 264 * WARC record date 265 */ 266 protected Date parseDate(String dateStr, String field) { 267 Date date = null; 268 if (dateStr != null && dateStr.length() > 0) { 269 date = WarcDateParser.getDate(dateStr); 270 if (date == null) { 271 // Invalid date. 272 addInvalidExpectedError("'" + field + "' value", 273 dateStr, 274 WarcConstants.WARC_DATE_FORMAT); 275 } 276 } else { 277 // Missing date. 278 addEmptyWarning("'" + field + "' field"); 279 } 280 return date; 281 } 282 283 /** 284 * Parse and validate WARC digest string. 285 * @param labelledDigest WARC digest string to parse 286 * @param field field name 287 * @return digest wrapper object or null, if unable to parse the value as a 288 * WARC Digest 289 */ 290 protected WarcDigest parseDigest(String labelledDigest, String field) { 291 WarcDigest digest = null; 292 if (labelledDigest != null && labelledDigest.length() > 0) { 293 digest = WarcDigest.parseWarcDigest(labelledDigest); 294 if (digest == null) { 295 // Invalid digest. 296 addInvalidExpectedError("'" + field + "' value", 297 labelledDigest, 298 WarcConstants.WARC_DIGEST_FORMAT); 299 } 300 } else { 301 // Missing digest. 302 addEmptyWarning("'" + field + "' field"); 303 } 304 return digest; 305 } 306 307}