package org.apache.nutch.tools;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/tools/CrawlDBScanner.class */
public class CrawlDBScanner extends Configured implements Tool, Mapper<Text, CrawlDatum, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> {
    public static final Logger LOG = LoggerFactory.getLogger(CrawlDBScanner.class);
    private String regex = null;
    private String status = null;

    public CrawlDBScanner() {
    }

    public CrawlDBScanner(Configuration configuration) {
        setConf(configuration);
    }

    @Override // java.io.Closeable, java.lang.AutoCloseable
    public void close() {
    }

    @Override // org.apache.hadoop.mapred.JobConfigurable
    public void configure(JobConf jobConf) {
        this.regex = jobConf.get("CrawlDBScanner.regex");
        this.status = jobConf.get("CrawlDBScanner.status");
    }

    @Override // org.apache.hadoop.mapred.Mapper
    public void map(Text text, CrawlDatum crawlDatum, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
        if ((this.status == null || this.status.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) && text.toString().matches(this.regex)) {
            outputCollector.collect(text, crawlDatum);
        }
    }

    @Override // org.apache.hadoop.mapred.Reducer
    public void reduce(Text text, Iterator<CrawlDatum> it, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
        while (it.hasNext()) {
            outputCollector.collect(text, it.next());
        }
    }

    private void scan(Path path, Path path2, String str, String str2, boolean z) throws IOException {
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long currentTimeMillis = System.currentTimeMillis();
        LOG.info("CrawlDB scanner: starting at " + simpleDateFormat.format(Long.valueOf(currentTimeMillis)));
        NutchJob nutchJob = new NutchJob(getConf());
        nutchJob.setJobName("Scan : " + path + " for URLS matching : " + str);
        nutchJob.set("CrawlDBScanner.regex", str);
        if (str2 != null) {
            nutchJob.set("CrawlDBScanner.status", str2);
        }
        FileInputFormat.addInputPath(nutchJob, new Path(path, "current"));
        nutchJob.setInputFormat(SequenceFileInputFormat.class);
        nutchJob.setMapperClass(CrawlDBScanner.class);
        nutchJob.setReducerClass(CrawlDBScanner.class);
        FileOutputFormat.setOutputPath(nutchJob, path2);
        if (z) {
            nutchJob.set("mapred.output.compress", "false");
            nutchJob.setOutputFormat(TextOutputFormat.class);
        } else {
            nutchJob.setOutputFormat(MapFileOutputFormat.class);
        }
        nutchJob.setMapOutputKeyClass(Text.class);
        nutchJob.setMapOutputValueClass(CrawlDatum.class);
        nutchJob.setOutputKeyClass(Text.class);
        nutchJob.setOutputValueClass(CrawlDatum.class);
        try {
            JobClient.runJob(nutchJob);
            long currentTimeMillis2 = System.currentTimeMillis();
            LOG.info("CrawlDb scanner: finished at " + simpleDateFormat.format(Long.valueOf(currentTimeMillis2)) + ", elapsed: " + TimingUtil.elapsedTime(currentTimeMillis, currentTimeMillis2));
        } catch (IOException e) {
            throw e;
        }
    }

    public static void main(String[] strArr) throws Exception {
        System.exit(ToolRunner.run(NutchConfiguration.create(), new CrawlDBScanner(), strArr));
    }

    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        if (strArr.length < 3) {
            System.err.println("Usage: CrawlDBScanner <crawldb> <output> <regex> [-s <status>] <-text>");
            return -1;
        }
        boolean z = false;
        Path path = new Path(strArr[0]);
        Path path2 = new Path(strArr[1]);
        String str = null;
        int i = 2;
        while (i < strArr.length) {
            if (strArr[i].equals("-text")) {
                z = true;
            } else if (strArr[i].equals("-s")) {
                i++;
                str = strArr[i];
            }
            i++;
        }
        try {
            scan(path, path2, strArr[2], str, z);
            return 0;
        } catch (Exception e) {
            LOG.error("CrawlDBScanner: " + StringUtils.stringifyException(e));
            return -1;
        }
    }
}
