package org.apache.nutch.crawl;

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
import org.apache.nutch.indexer.solr.SolrIndexer;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Crawl.class */
/**
 * Command-line driver that chains the individual crawl tools into one run.
 *
 * <p>In order, {@link #run(String[])} calls {@code Injector.inject}, then up to
 * {@code depth} rounds of {@code Generator.generate} / {@code Fetcher.fetch} /
 * (optionally) {@code ParseSegment.parse} / {@code CrawlDb.update}, then
 * {@code LinkDb.invert}, and finally, when a Solr URL was supplied,
 * {@code SolrIndexer.indexSolr} followed by {@code SolrDeleteDuplicates.dedup}.
 */
public class Crawl extends Configured implements Tool {
    public static final Logger LOG = LoggerFactory.getLogger(Crawl.class);

    /** Usage line printed whenever the command line cannot be interpreted. */
    private static final String USAGE =
            "Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]";

    /** Number of generate/fetch/update rounds when {@code -depth} is not given. */
    private static final int DEFAULT_DEPTH = 5;

    /** Sentinel meaning "no {@code -topN} limit was requested". */
    private static final long NO_TOP_N = Long.MAX_VALUE;

    /**
     * Returns the current time formatted as {@code yyyyMMddHHmmss}; used to build
     * default directory names.
     */
    private static String getDate() {
        return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
    }

    /**
     * Program entry point: runs this tool through {@link ToolRunner} and exits the
     * JVM with the tool's return code.
     *
     * @param args command-line arguments, see {@link #run(String[])}
     * @throws Exception whatever {@link #run(String[])} propagates
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(NutchConfiguration.create(), new Crawl(), args));
    }

    /** Returns {@code true} for the options that consume the following argument as their value. */
    private static boolean takesValue(String arg) {
        return "-dir".equals(arg)
                || "-threads".equals(arg)
                || "-depth".equals(arg)
                || "-topN".equals(arg)
                || "-solr".equals(arg);
    }

    /**
     * Parses the command line and performs the crawl.
     *
     * <p>Recognised options are {@code -dir}, {@code -threads}, {@code -depth},
     * {@code -topN} and {@code -solr}, each followed by its value. Any other
     * non-null argument is taken as the seed URL directory; if several are given
     * the last one wins.
     *
     * @param args command-line arguments
     * @return {@code -1} when the arguments are unusable (none given, an option is
     *         missing its value, or no seed URL directory was supplied), otherwise
     *         {@code 0}
     * @throws NumberFormatException if a numeric option value is not a valid number
     * @throws Exception anything thrown by the underlying crawl tools
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] args) throws Exception {
        if (args.length < 1) {
            System.out.println(USAGE);
            return -1;
        }

        Path rootUrlDir = null;
        Path dir = new Path("crawl-" + getDate());
        int threads = getConf().getInt("fetcher.threads.fetch", 10);
        int depth = DEFAULT_DEPTH;
        long topN = NO_TOP_N;
        String solrUrl = null;

        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (takesValue(arg)) {
                // An option given as the very last argument has no value to read.
                if (i + 1 >= args.length) {
                    System.err.println("Missing value for option " + arg);
                    System.out.println(USAGE);
                    return -1;
                }
                String value = args[++i];
                if ("-dir".equals(arg)) {
                    dir = new Path(value);
                } else if ("-threads".equals(arg)) {
                    threads = Integer.parseInt(value);
                } else if ("-depth".equals(arg)) {
                    depth = Integer.parseInt(value);
                } else if ("-topN".equals(arg)) {
                    // topN is a long all the way down to Generator.generate, so parse it as one.
                    topN = Long.parseLong(value);
                } else {
                    solrUrl = value;
                }
            } else if (arg != null) {
                rootUrlDir = new Path(arg);
            }
        }

        // Without a seed directory there is nothing to inject; fail with a usage
        // error instead of handing null to the injector.
        if (rootUrlDir == null) {
            System.err.println("No seed URL directory given.");
            System.out.println(USAGE);
            return -1;
        }

        NutchJob job = new NutchJob(getConf());
        if (solrUrl == null) {
            LOG.warn("solrUrl is not set, indexing will be skipped...");
        }
        FileSystem fs = FileSystem.get(job);

        LOG.info("crawl started in: {}", dir);
        LOG.info("rootUrlDir = {}", rootUrlDir);
        LOG.info("threads = {}", threads);
        LOG.info("depth = {}", depth);
        LOG.info("solrUrl={}", solrUrl);
        if (topN != NO_TOP_N) {
            LOG.info("topN = {}", topN);
        }

        Path crawlDbPath = new Path(dir + "/crawldb");
        Path linkDbPath = new Path(dir + "/linkdb");
        Path segmentsPath = new Path(dir + "/segments");

        // NOTE(review): the returned path is never used, but the call is kept because
        // its side effects (if any) are not visible from this file - confirm before removing.
        job.getLocalPath("crawl/" + getDate());

        Injector injector = new Injector(getConf());
        Generator generator = new Generator(getConf());
        Fetcher fetcher = new Fetcher(getConf());
        ParseSegment parseSegment = new ParseSegment(getConf());
        CrawlDb crawlDbTool = new CrawlDb(getConf());
        LinkDb linkDbTool = new LinkDb(getConf());

        injector.inject(crawlDbPath, rootUrlDir);

        // Counts the rounds that actually fetched something; stays 0 when the very
        // first generate call yields nothing.
        int completedRounds = 0;
        while (completedRounds < depth) {
            Path[] segments = generator.generate(
                    crawlDbPath, segmentsPath, -1, topN, System.currentTimeMillis());
            if (segments == null) {
                LOG.info("Stopping at depth={} - no more URLs to fetch.", completedRounds);
                break;
            }
            fetcher.fetch(segments[0], threads);
            // Parse as a separate step only when the fetcher is not already parsing.
            if (!Fetcher.isParsing(job)) {
                parseSegment.parse(segments[0]);
            }
            crawlDbTool.update(crawlDbPath, segments, true, true);
            completedRounds++;
        }

        if (completedRounds > 0) {
            linkDbTool.invert(linkDbPath, segmentsPath, true, true, false);
            if (solrUrl != null) {
                // Index every directory found under the segments path.
                Path[] allSegments = HadoopFSUtil.getPaths(
                        fs.listStatus(segmentsPath, HadoopFSUtil.getPassDirectoriesFilter(fs)));
                new SolrIndexer(getConf())
                        .indexSolr(solrUrl, crawlDbPath, linkDbPath, Arrays.asList(allSegments));

                SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
                dedup.setConf(getConf());
                dedup.dedup(solrUrl);
            }
        } else {
            LOG.warn("No URLs to fetch - check your seed list and URL filters.");
        }

        LOG.info("crawl finished: {}", dir);
        return 0;
    }
}
