package org.apache.nutch.parse;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/parse/ParseSegment.class */
/**
 * Hadoop map/reduce job and command-line tool that parses the records stored
 * in the {@code content} directory of a segment.
 *
 * <p>The mapper turns every successfully fetched {@link Content} record into
 * one {@link ParseImpl} per entry of the parse result; the reducer forwards
 * the first value seen for each key. The job writes its output into the
 * segment directory itself through {@link ParseOutputFormat}.
 */
public class ParseSegment extends Configured implements Tool, Mapper<WritableComparable, Content, Text, ParseImpl>, Reducer<Text, Writable, Text, Writable> {
    public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);

    /** Job property: when {@code true} (the default) content that looks truncated is not parsed. */
    public static final String SKIP_TRUNCATED = "parser.skip.truncated";

    /**
     * Value stored under {@link Nutch#FETCH_STATUS_KEY} for content that was
     * fetched successfully; any other value makes {@link #map} skip the record.
     * NOTE(review): the literal presumably mirrors a status constant declared
     * elsewhere in the project - confirm and reference that constant instead.
     */
    private static final int FETCH_STATUS_SUCCESS = 33;

    private ScoringFilters scfilters;
    private boolean skipTruncated;

    /** Reusable key instance that {@link Text} input keys are copied into. */
    private final Text newKey = new Text();

    /** Creates an instance without a configuration. */
    public ParseSegment() {
        this(null);
    }

    /**
     * Creates an instance bound to the given configuration.
     *
     * @param configuration configuration handed to {@link Configured}; may be {@code null}
     */
    public ParseSegment(Configuration configuration) {
        super(configuration);
    }

    /**
     * Prepares the task: stores the job configuration, creates the scoring
     * filters and reads {@link #SKIP_TRUNCATED} (default {@code true}).
     *
     * @param jobConf configuration of the running job
     */
    @Override // org.apache.hadoop.mapred.JobConfigurable
    public void configure(JobConf jobConf) {
        setConf(jobConf);
        this.scfilters = new ScoringFilters(jobConf);
        this.skipTruncated = jobConf.getBoolean(SKIP_TRUNCATED, true);
    }

    /** Nothing to release. */
    @Override // java.io.Closeable, java.lang.AutoCloseable
    public void close() {
    }

    /**
     * Parses one content record and emits a {@link ParseImpl} for every entry
     * of the parse result.
     *
     * <p>The record is skipped when its fetch status is missing, malformed or
     * not the success status, and - if {@link #SKIP_TRUNCATED} is enabled -
     * when {@link #isTruncated(Content)} reports truncation. Failures while
     * parsing or post-processing are logged and end the processing of this
     * record only.
     *
     * @param key      key of the input record
     * @param content  fetched content to parse
     * @param output   collector receiving the parse of each result entry
     * @param reporter used to count parse statuses in the {@code ParserStatus} counter group
     * @throws IOException if the collector fails to accept an output record
     */
    @Override // org.apache.hadoop.mapred.Mapper
    public void map(WritableComparable key, Content content, OutputCollector<Text, ParseImpl> output, Reporter reporter) throws IOException {
        if (key instanceof Text) {
            // Text keys are copied into the reusable instance before use.
            this.newKey.set(key.toString());
            key = this.newKey;
        }
        if (!isFetchedSuccessfully(key, content)) {
            return;
        }
        if (this.skipTruncated && isTruncated(content)) {
            return;
        }

        Iterator<Map.Entry<Text, Parse>> entries;
        try {
            entries = new ParseUtil(getConf()).parse(content).iterator();
        } catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
            return;
        }

        while (entries.hasNext()) {
            Map.Entry<Text, Parse> entry = entries.next();
            Text url = entry.getKey();
            ParseImpl result;
            try {
                result = finishParse(key, url, entry.getValue(), content, reporter);
            } catch (RuntimeException e) {
                // Best effort: a failure while post-processing abandons the rest of this record.
                LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
                return;
            }
            // Deliberately outside the try blocks: a failed write must fail the
            // task instead of being logged as a parse error and dropped.
            output.collect(url, result);
        }
    }

    /**
     * Tells whether the fetch status recorded in the content metadata is the
     * success status. Logs the reason whenever the answer is {@code false}.
     */
    private static boolean isFetchedSuccessfully(Object key, Content content) {
        Metadata metadata = content.getMetadata();
        String fetchStatus = metadata == null ? null : metadata.get(Nutch.FETCH_STATUS_KEY);
        if (fetchStatus == null) {
            LOG.warn("Skipping " + key + " as fetch status is missing");
            return false;
        }
        int status;
        try {
            status = Integer.parseInt(fetchStatus);
        } catch (NumberFormatException e) {
            LOG.warn("Skipping " + key + " as fetch status '" + fetchStatus + "' is not a number");
            return false;
        }
        if (status != FETCH_STATUS_SUCCESS) {
            LOG.debug("Skipping {} as content is not fetched successfully", key);
            return false;
        }
        return true;
    }

    /**
     * Post-processes one parse-result entry: counts its status, substitutes an
     * empty parse for a failed one, records segment name and signature in the
     * content metadata, runs the scoring filters and wraps the result for output.
     */
    private ParseImpl finishParse(Object key, Text url, Parse parse, Content content, Reporter reporter) {
        ParseStatus status = parse.getData().getStatus();
        // The timer starts here, so the logged duration covers post-processing only.
        long start = System.currentTimeMillis();
        reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[status.getMajorCode()], 1L);
        if (!status.isSuccess()) {
            LOG.warn("Error parsing: " + key + ": " + status);
            parse = status.getEmptyParse(getConf());
        }
        parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY));
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
        parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        try {
            this.scfilters.passScoreAfterParsing(url, content, parse);
        } catch (ScoringFilterException e) {
            // A scoring failure is not fatal; the parse is still emitted.
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error passing score: " + url + ": " + e.getMessage());
            }
        }
        LOG.info("Parsed (" + Long.toString(System.currentTimeMillis() - start) + "ms):" + url);
        return new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse.isCanonical());
    }

    /**
     * Checks whether the stored bytes are shorter than the length announced in
     * the {@code Content-Length} metadata entry.
     *
     * @param content content record to inspect
     * @return {@code true} only if a numeric {@code Content-Length} is present
     *         and larger than the number of stored bytes; {@code false} when
     *         the bytes, the metadata or the header are missing, or the header
     *         is not a number
     */
    public static boolean isTruncated(Content content) {
        byte[] bytes = content.getContent();
        if (bytes == null) {
            return false;
        }
        Metadata metadata = content.getMetadata();
        if (metadata == null) {
            return false;
        }
        String header = metadata.get("Content-Length");
        if (header != null) {
            header = header.trim();
        }
        if (StringUtil.isEmpty(header)) {
            return false;
        }
        String url = content.getUrl();
        long declaredLength;
        try {
            // long, not int: a declared length above Integer.MAX_VALUE is still a valid number.
            declaredLength = Long.parseLong(header);
        } catch (NumberFormatException e) {
            LOG.warn("Wrong contentlength format for " + url, (Throwable) e);
            return false;
        }
        int actualLength = bytes.length;
        if (declaredLength > actualLength) {
            LOG.info(url + " skipped. Content of size " + declaredLength + " was truncated to " + actualLength);
            return true;
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(url + " actualSize=" + actualLength + " inHeaderSize=" + declaredLength);
        }
        return false;
    }

    /**
     * Emits the first value of the group under its key; further values for the
     * same key are not read.
     *
     * @param text           key of the group
     * @param it             values of the group
     * @param outputCollector collector receiving the single output record
     * @param reporter       unused
     * @throws IOException if the collector fails to accept the record
     */
    @Override // org.apache.hadoop.mapred.Reducer
    public void reduce(Text text, Iterator<Writable> it, OutputCollector<Text, Writable> outputCollector, Reporter reporter) throws IOException {
        outputCollector.collect(text, it.next());
    }

    /**
     * Configures and runs the parse job for one segment, blocking until the
     * job has finished. Input is the segment's {@code content} directory; the
     * output path is the segment directory itself.
     *
     * @param path segment directory
     * @throws IOException if the job cannot be run or fails
     */
    public void parse(Path path) throws IOException {
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ParseSegment: starting at " + timestampFormat.format(Long.valueOf(start)));
            LOG.info("ParseSegment: segment: " + path);
        }

        NutchJob job = new NutchJob(getConf());
        job.setJobName("parse " + path);

        FileInputFormat.addInputPath(job, new Path(path, "content"));
        // Made available to the map tasks, which copy it into each parse's metadata.
        job.set(Nutch.SEGMENT_NAME_KEY, path.getName());
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(ParseSegment.class);
        job.setReducerClass(ParseSegment.class);

        FileOutputFormat.setOutputPath(job, path);
        job.setOutputFormat(ParseOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ParseImpl.class);

        JobClient.runJob(job);

        long end = System.currentTimeMillis();
        LOG.info("ParseSegment: finished at " + timestampFormat.format(Long.valueOf(end)) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    /**
     * Command-line entry point; exits the JVM with the code returned by {@link #run(String[])}.
     *
     * @param strArr command-line arguments, see {@link #run(String[])}
     * @throws Exception if the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        System.exit(ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), strArr));
    }

    /**
     * Parses the segment named by the first argument.
     *
     * <p>The optional flags {@code -noFilter} and {@code -noNormalize}
     * (case-insensitive) set {@code parse.filter.urls} and
     * {@code parse.normalize.urls} to {@code false}; other extra arguments
     * are ignored.
     *
     * @param args {@code segment [-noFilter] [-noNormalize]}
     * @return {@code 0} on success, {@code -1} if no segment was given
     * @throws Exception if the parse job fails
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] args) throws Exception {
        if (args.length == 0) {
            System.err.println("Usage: ParseSegment segment [-noFilter] [-noNormalize]");
            // Report the usage error to the caller instead of terminating the
            // JVM; main() still turns the code into the process exit status.
            return -1;
        }
        for (int i = 1; i < args.length; i++) {
            String option = args[i];
            if ("-nofilter".equalsIgnoreCase(option)) {
                getConf().setBoolean("parse.filter.urls", false);
            } else if ("-nonormalize".equalsIgnoreCase(option)) {
                getConf().setBoolean("parse.normalize.urls", false);
            }
        }
        parse(new Path(args[0]));
        return 0;
    }
}
