package org.apache.nutch.tools;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.CrawlDbReader;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.pdfbox.pdmodel.documentinterchange.taggedpdf.PDLayoutAttributeObject;
import shaded.org.apache.commons.io.IOUtils;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/tools/Benchmark.class */
public class Benchmark extends Configured implements Tool {
    private static final Log LOG = LogFactory.getLog(Benchmark.class);

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/tools/Benchmark$BenchmarkResults.class */
    public static final class BenchmarkResults {
        Map<String, Map<String, Long>> timings = new HashMap();
        List<String> runs = new ArrayList();
        List<String> stages = new ArrayList();
        int seeds;
        int depth;
        int threads;
        boolean delete;
        long topN;
        long elapsed;
        String plugins;

        public void addTiming(String str, String str2, long j) {
            if (!this.runs.contains(str2)) {
                this.runs.add(str2);
            }
            if (!this.stages.contains(str)) {
                this.stages.add(str);
            }
            Map<String, Long> map = this.timings.get(str);
            if (map == null) {
                map = new HashMap();
                this.timings.put(str, map);
            }
            map.put(str2, Long.valueOf(j));
        }

        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("* Plugins:\t" + this.plugins + IOUtils.LINE_SEPARATOR_UNIX);
            sb.append("* Seeds:\t" + this.seeds + IOUtils.LINE_SEPARATOR_UNIX);
            sb.append("* Depth:\t" + this.depth + IOUtils.LINE_SEPARATOR_UNIX);
            sb.append("* Threads:\t" + this.threads + IOUtils.LINE_SEPARATOR_UNIX);
            sb.append("* TopN:\t" + this.topN + IOUtils.LINE_SEPARATOR_UNIX);
            sb.append("* Delete:\t" + this.delete + IOUtils.LINE_SEPARATOR_UNIX);
            sb.append("* TOTAL ELAPSED:\t" + this.elapsed + IOUtils.LINE_SEPARATOR_UNIX);
            for (String str : this.stages) {
                Map<String, Long> map = this.timings.get(str);
                if (map != null) {
                    sb.append("- stage: " + str + IOUtils.LINE_SEPARATOR_UNIX);
                    for (String str2 : this.runs) {
                        Long l = map.get(str2);
                        if (l != null) {
                            sb.append("\trun " + str2 + "\t" + l + IOUtils.LINE_SEPARATOR_UNIX);
                        }
                    }
                }
            }
            return sb.toString();
        }

        public List<String> getStages() {
            return this.stages;
        }

        public List<String> getRuns() {
            return this.runs;
        }
    }

    /**
     * Command-line entry point. Runs this tool through {@link ToolRunner} with a
     * configuration obtained from {@link NutchConfiguration#create()} and exits
     * the JVM with the value returned by the tool.
     *
     * @param strArr command-line arguments, passed through to {@link #run(String[])}
     * @throws Exception if the tool run throws
     */
    public static void main(String[] strArr) throws Exception {
        final Configuration nutchConf = NutchConfiguration.create();
        final int exitCode = ToolRunner.run(nutchConf, new Benchmark(), strArr);
        System.exit(exitCode);
    }

    /**
     * Formats the current time with the pattern {@code yyyyMMddHHmmss}.
     *
     * @return the current time as a 14-character timestamp string
     */
    private static String getDate() {
        final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss");
        final Date now = new Date(System.currentTimeMillis());
        return timestampFormat.format(now);
    }

    /**
     * Writes a generated seed list to a file named {@code seeds} inside the
     * given directory. Each line has the form {@code http://www.test-N.com/}
     * for N from 0 (inclusive) to {@code count} (exclusive), terminated by CRLF.
     *
     * @param fileSystem file system on which the seed file is created
     * @param path       directory that will contain the {@code seeds} file
     * @param count      number of seed URLs to write
     * @throws Exception if the file cannot be created or written
     */
    private void createSeeds(FileSystem fileSystem, Path path, int count) throws Exception {
        FSDataOutputStream out = fileSystem.create(new Path(path, "seeds"));
        try {
            for (int n = 0; n < count; n++) {
                // Explicit charset so the bytes written do not depend on the platform default.
                out.write(("http://www.test-" + n + ".com/\r\n").getBytes("UTF-8"));
            }
            out.flush();
        } finally {
            // Close even when a write fails so the stream is not leaked.
            out.close();
        }
    }

    /**
     * Parses the command-line options, runs the benchmark and prints the
     * resulting report to standard output.
     *
     * <p>With no arguments the usage text is printed to standard error. An
     * unknown option, an option that is missing its value, or a non-numeric
     * value for a numeric option is logged as fatal. All of these cases
     * return {@code -1} without running the benchmark.
     *
     * @param strArr command-line arguments
     * @return {@code 0} when the benchmark ran, {@code -1} on a usage or argument error
     * @throws Exception if the benchmark itself fails
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        String plugins = "protocol-http|parse-tika|scoring-opic|urlfilter-regex|urlnormalizer-pass";
        int seeds = 1;
        int depth = 10;
        int threads = 10;
        boolean delete = true;
        // There is no command-line option for topN; the largest long value is always passed.
        long topN = Long.MAX_VALUE;
        if (strArr.length == 0) {
            System.err.println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-maxPerHost NN] [-plugins <regex>]");
            System.err.println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
            System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
            System.err.println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
            System.err.println("\t-keep\tkeep segment data (default: delete after updatedb)");
            System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
            System.err.println("\tNOTE: if not specified, this is reset to: " + plugins);
            System.err.println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
            System.err.println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
            return -1;
        }
        int maxPerHost = Integer.MAX_VALUE;
        int pos = 0;
        while (pos < strArr.length) {
            String option = strArr[pos];
            if (option.equals("-keep")) {
                // The only option that takes no value.
                delete = false;
                pos++;
                continue;
            }
            boolean numeric = option.equals("-seeds")
                    || option.equals("-threads")
                    || option.equals("-depth")
                    || option.equalsIgnoreCase("-maxPerHost");
            if (!numeric && !option.equals("-plugins")) {
                LOG.fatal("Invalid argument: '" + option + "'");
                return -1;
            }
            if (pos + 1 >= strArr.length) {
                // Report a trailing option without a value instead of indexing past the array.
                LOG.fatal("Missing value for argument: '" + option + "'");
                return -1;
            }
            String value = strArr[pos + 1];
            if (!numeric) {
                plugins = value;
            } else {
                int number;
                try {
                    number = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    LOG.fatal("Invalid numeric value for argument '" + option + "': '" + value + "'", e);
                    return -1;
                }
                if (option.equals("-seeds")) {
                    seeds = number;
                } else if (option.equals("-threads")) {
                    threads = number;
                } else if (option.equals("-depth")) {
                    depth = number;
                } else {
                    maxPerHost = number;
                }
            }
            pos += 2;
        }
        System.out.println(benchmark(seeds, depth, threads, maxPerHost, topN, delete, plugins));
        return 0;
    }

    /**
     * Runs a complete benchmark crawl and returns the timings of each stage.
     *
     * <p>The method first adjusts this tool's configuration: HTTP proxy
     * {@code localhost:8181}, agent name {@code test}, the plugin list, and
     * the per-host generator limit. It then creates a working directory named
     * {@code bench-<millis>} under {@code hadoop.tmp.dir} and writes a seed
     * list into it. The seeds are injected, followed by up to {@code depth}
     * cycles of generate, fetch, parse (only when the fetcher is not
     * parsing), crawl db update and link inversion. Each step is timed
     * separately. The loop stops early when the generator returns no
     * segments. Finally a crawl db statistics job is run. The working
     * directory itself is not removed by this method.
     *
     * @param seeds      number of seed URLs to generate
     * @param depth      maximum number of crawl cycles
     * @param threads    thread count passed to the fetcher
     * @param maxPerHost value stored under {@link Generator#GENERATOR_MAX_COUNT}
     * @param topN       topN value passed to the generator
     * @param delete     whether each cycle's segments are deleted after link inversion
     * @param plugins    value for {@code plugin.includes}; {@code "default"} or
     *                   {@code null} leaves the configured value untouched
     * @return the collected settings and timings
     * @throws Exception if any file system operation or crawl step fails
     */
    public BenchmarkResults benchmark(int seeds, int depth, int threads, int maxPerHost, long topN, boolean delete, String plugins) throws Exception {
        Configuration conf = getConf();
        conf.set("http.proxy.host", "localhost");
        conf.setInt("http.proxy.port", 8181);
        conf.set("http.agent.name", "test");
        conf.set("http.robots.agents", "test,*");
        // A null plugin list is treated like "default" rather than failing on equals().
        if (plugins != null && !"default".equals(plugins)) {
            conf.set("plugin.includes", plugins);
        }
        conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
        conf.set(Generator.GENERATOR_COUNT_MODE, Generator.GENERATOR_COUNT_VALUE_HOST);
        NutchJob job = new NutchJob(getConf());
        FileSystem fs = FileSystem.get(job);
        Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-" + System.currentTimeMillis());
        fs.mkdirs(dir);
        Path rootUrlDir = new Path(dir, "seed");
        fs.mkdirs(rootUrlDir);
        createSeeds(fs, rootUrlDir, seeds);
        if (LOG.isInfoEnabled()) {
            LOG.info("crawl started in: " + dir);
            LOG.info("rootUrlDir = " + rootUrlDir);
            LOG.info("threads = " + threads);
            LOG.info("depth = " + depth);
        }
        BenchmarkResults results = new BenchmarkResults();
        results.delete = delete;
        results.depth = depth;
        results.plugins = plugins;
        results.seeds = seeds;
        results.threads = threads;
        results.topN = topN;
        Path crawlDbPath = new Path(dir, "crawldb");
        Path linkDbPath = new Path(dir, "linkdb");
        Path segmentsPath = new Path(dir, "segments");
        // Holds the start time for now; converted to a duration once the crawl loop ends.
        results.elapsed = System.currentTimeMillis();
        Injector injector = new Injector(getConf());
        Generator generator = new Generator(getConf());
        Fetcher fetcher = new Fetcher(getConf());
        ParseSegment parseSegment = new ParseSegment(getConf());
        CrawlDb crawlDb = new CrawlDb(getConf());
        LinkDb linkDb = new LinkDb(getConf());

        long start = System.currentTimeMillis();
        injector.inject(crawlDbPath, rootUrlDir);
        // Injection happens once; it is filed under the same run label as the first cycle.
        results.addTiming(URLNormalizers.SCOPE_INJECT, "0", System.currentTimeMillis() - start);

        int cycle;
        for (cycle = 0; cycle < depth; cycle++) {
            String run = String.valueOf(cycle);

            start = System.currentTimeMillis();
            Path[] segments = generator.generate(crawlDbPath, segmentsPath, -1, topN, System.currentTimeMillis());
            results.addTiming("generate", run, System.currentTimeMillis() - start);
            if (segments == null) {
                LOG.info("Stopping at depth=" + cycle + " - no more URLs to fetch.");
                break;
            }

            start = System.currentTimeMillis();
            fetcher.fetch(segments[0], threads);
            results.addTiming("fetch", run, System.currentTimeMillis() - start);

            // A separate parse step is only run when the fetcher is not already parsing.
            if (!Fetcher.isParsing(job)) {
                start = System.currentTimeMillis();
                parseSegment.parse(segments[0]);
                results.addTiming("parse", run, System.currentTimeMillis() - start);
            }

            start = System.currentTimeMillis();
            crawlDb.update(crawlDbPath, segments, true, true);
            results.addTiming("update", run, System.currentTimeMillis() - start);

            start = System.currentTimeMillis();
            linkDb.invert(linkDbPath, segments, true, true, false);
            results.addTiming("invert", run, System.currentTimeMillis() - start);

            if (delete) {
                for (Path segment : segments) {
                    fs.delete(segment, true);
                }
            }
        }
        // cycle is still 0 when depth <= 0 or when the very first generate produced nothing.
        if (cycle == 0) {
            LOG.warn("No URLs to fetch - check your seed list and URL filters.");
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("crawl finished: " + dir);
        }
        results.elapsed = System.currentTimeMillis() - results.elapsed;
        // The statistics job runs after the elapsed time is fixed, so it is not counted.
        new CrawlDbReader().processStatJob(crawlDbPath.toString(), conf, false);
        return results;
    }
}
