001/**
002 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
003 * and GZip files. (http://jwat.org/)
004 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.jwat.warc;
019
020import org.jwat.common.ContentType;
021import org.jwat.common.Diagnosis;
022import org.jwat.common.DiagnosisType;
023import org.jwat.common.Diagnostics;
024import org.jwat.common.IPAddressParser;
025import org.jwat.common.Uri;
026import org.jwat.common.UriProfile;
027
028import java.net.InetAddress;
029import java.util.Date;
030
031/**
032 * Separate class containing all the different types of field parser.
033 * Including validating parsers for strings, integers, longs,
034 * content-types, IP's, URI's, WARC dates, and WARC digests.
035 *
036 * @author nicl
037 */
038public class WarcFieldParsers {
039
040    /** Diagnostics used to report diagnoses.
041     * Must be set prior to calling the various methods. */
042    protected Diagnostics<Diagnosis> diagnostics;
043
044    /**
045     * Add an error diagnosis on the given entity stating that it is invalid
046     * and something else was expected. The optional information should provide
047     * more details and/or format information.
048     * @param entity entity examined
049     * @param information optional extra information
050     */
051    protected void addInvalidExpectedError(String entity, String... information) {
052        diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_EXPECTED, entity, information));
053    }
054
055    /**
056     * Add a warning diagnosis on the given entity stating that it is empty.
057     * @param entity entity examined
058     */
059    protected void addEmptyWarning(String entity) {
060        diagnostics.addWarning(new Diagnosis(DiagnosisType.EMPTY, entity));
061    }
062
063    /**
064     * Validates that the string is not null.
065     * @param str the value to validate
066     * @param field field name
067     * @return the original value
068     */
069    protected String parseString(String str, String field) {
070        if (((str == null) || (str.trim().length() == 0))) {
071            addEmptyWarning("'" + field + "' field");
072        }
073        return str;
074    }
075
076    /**
077     * Returns an Integer object holding the value of the specified string.
078     * @param intStr the value to parse.
079     * @param field field name
080     * @return an integer object holding the value of the specified string or null,
081     * if unable to parse the value as an integer
082     */
083    protected Integer parseInteger(String intStr, String field) {
084         Integer iVal = null;
085         if (intStr != null && intStr.length() > 0) {
086            try {
087                iVal = Integer.valueOf(intStr);
088            } catch (Exception e) {
089                // Invalid integer value.
090                addInvalidExpectedError("'" + field + "' value",
091                        intStr,
092                        "Numeric format");
093            }
094         } else {
095             // Missing integer value.
096             addEmptyWarning("'" + field + "' field");
097         }
098         return iVal;
099    }
100
101    /**
102     * Returns a Long object holding the value of the specified string.
103     * @param longStr the value to parse.
104     * @param field field name
105     * @return a long object holding the value of the specified string or null,
106     * if unable to parse the value as a Long
107     */
108    protected Long parseLong(String longStr, String field) {
109        Long lVal = null;
110         if (longStr != null && longStr.length() > 0) {
111            try {
112                lVal = Long.valueOf(longStr);
113            } catch (Exception e) {
114                // Invalid long value.
115                addInvalidExpectedError("'" + field + "' value",
116                        longStr,
117                        "Numeric format");
118            }
119         } else {
120             // Missing long value.
121             addEmptyWarning("'" + field + "' field");
122         }
123         return lVal;
124    }
125
126    /**
127     * Parse and validate content-type string with optional parameters.
128     * @param contentTypeStr content-type string to parse
129     * @param field field name
130     * @return content type or null, if unable to extract the
131     * content-type
132     */
133    protected ContentType parseContentType(String contentTypeStr, String field) {
134        ContentType contentType = null;
135        if (contentTypeStr != null && contentTypeStr.length() != 0) {
136            contentType = ContentType.parseContentType(contentTypeStr);
137            if (contentType == null) {
138                // Invalid content-type.
139                addInvalidExpectedError("'" + field + "' value",
140                        contentTypeStr,
141                        WarcConstants.CONTENT_TYPE_FORMAT);
142            }
143        } else {
144            // Missing content-type.
145            addEmptyWarning("'" + field + "' field");
146        }
147        return contentType;
148    }
149
150    /**
151     * Parse and validate an IP address.
152     * @param ipAddress the IP address to parse
153     * @param field field name
154     * @return the IP address or null, if unable to parse the value as an
155     * IP-address
156     */
157    protected InetAddress parseIpAddress(String ipAddress, String field) {
158        InetAddress inetAddr = null;
159        if (ipAddress != null && ipAddress.length() > 0) {
160            inetAddr = IPAddressParser.getAddress(ipAddress);
161            if (inetAddr == null) {
162                // Invalid ip address.
163                addInvalidExpectedError("'" + field + "' value",
164                        ipAddress,
165                        "IPv4 or IPv6 format");
166            }
167        } else {
168            // Missing ip address.
169            addEmptyWarning("'" + field + "' field");
170        }
171        return inetAddr;
172    }
173
174    /**
175     * Returns an URI object holding the value of the specified string.
176     * @param uriStr the URL to parse
177     * @param field field name
178     * @param uriProfile the uri profile
179     * @param bLtGt something
180     * @return an URI object holding the value of the specified string or null,
181     * if unable to parse the value as an URI object
182     */
183    protected Uri parseUri(String uriStr, boolean bLtGt, UriProfile uriProfile, String field) {
184        Uri uri = null;
185        String uriStrClean = uriStr;
186        int ltGtBf = 0;
187        if (uriStrClean != null && uriStrClean.length() != 0) {
188            int fIdx = 0;
189            int tIdx = uriStrClean.length();
190            if (uriStrClean.startsWith("<")) {
191                ltGtBf |= 2;
192                ++fIdx;
193            }
194            if (uriStrClean.endsWith(">")) {
195                ltGtBf |= 1;
196                --tIdx;
197            }
198            if (ltGtBf != 0) {
199                uriStrClean = uriStrClean.substring(fIdx, tIdx);
200            }
201            if (bLtGt) {
202                switch (ltGtBf) {
203                case 2:
204                    addInvalidExpectedError("'" + field + "' value", uriStr, "Missing trailing '>' character");
205                    break;
206                case 1:
207                    addInvalidExpectedError("'" + field + "' value", uriStr, "Missing leading '<' character");
208                    break;
209                case 0:
210                    addInvalidExpectedError("'" + field + "' value", uriStr, "Missing encapsulating '<' and '>' characters");
211                    break;
212                case 3:
213                default:
214                    break;
215                }
216            } else {
217                switch (ltGtBf) {
218                case 2:
219                    addInvalidExpectedError("'" + field + "' value", uriStr, "Unexpected leading '<' character");
220                    break;
221                case 1:
222                    addInvalidExpectedError("'" + field + "' value", uriStr, "Unexpected trailing '>' character");
223                    break;
224                case 3:
225                    addInvalidExpectedError("'" + field + "' value", uriStr, "Unexpected encapsulating '<' and '>' characters");
226                    break;
227                case 0:
228                default:
229                    break;
230                }
231            }
232            try {
233                uri = new Uri(uriStrClean, uriProfile);
234            } catch (Exception e) {
235                // Invalid URI.
236                addInvalidExpectedError("'" + field + "' value",
237                        uriStrClean,
238                        e.getMessage());
239            }
240            if (uri != null) {
241                String scheme = uri.getScheme();
242                if (scheme == null) {
243                    uri = null;
244                    // Relative URI.
245                    addInvalidExpectedError("'" + field + "' value",
246                            uriStrClean,
247                            "Absolute URI");
248                } else {
249                    scheme = scheme.toLowerCase();
250                }
251            }
252        } else {
253            // Missing URI.
254            addEmptyWarning("'" + field + "' field");
255        }
256        return uri;
257    }
258
259    /**
260     * Parses WARC record date.
261     * @param dateStr the date to parse.
262     * @param field field name
263     * @return the formatted date or null, if unable to parse the value as a
264     * WARC record date
265     */
266    protected Date parseDate(String dateStr, String field) {
267        Date date = null;
268        if (dateStr != null && dateStr.length() > 0) {
269                date = WarcDateParser.getDate(dateStr);
270                if (date == null) {
271                    // Invalid date.
272                    addInvalidExpectedError("'" + field + "' value",
273                            dateStr,
274                            WarcConstants.WARC_DATE_FORMAT);
275                }
276        } else {
277            // Missing date.
278            addEmptyWarning("'" + field + "' field");
279        }
280        return date;
281    }
282
283    /**
284     * Parse and validate WARC digest string.
285     * @param labelledDigest WARC digest string to parse
286     * @param field field name
287     * @return digest wrapper object or null, if unable to parse the value as a
288     * WARC Digest
289     */
290    protected WarcDigest parseDigest(String labelledDigest, String field) {
291        WarcDigest digest = null;
292        if (labelledDigest != null && labelledDigest.length() > 0) {
293                digest = WarcDigest.parseWarcDigest(labelledDigest);
294                if (digest == null) {
295                    // Invalid digest.
296                    addInvalidExpectedError("'" + field + "' value",
297                            labelledDigest,
298                            WarcConstants.WARC_DIGEST_FORMAT);
299                }
300        } else {
301            // Missing digest.
302            addEmptyWarning("'" + field + "' field");
303        }
304        return digest;
305    }
306
307}