package org.apache.nutch.scoring.webgraph;

import com.beust.jcommander.Parameters;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Random;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.scoring.webgraph.Loops;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkRank.class */
public class LinkRank extends Configured implements Tool {
    /** Logger shared by the driver methods and the nested map/reduce classes of this tool. */
    public static final Logger LOG = LoggerFactory.getLogger(LinkRank.class);
    /**
     * Name of the temporary output directory of the counter job (created under the web graph
     * path and deleted after it is read); also the single key under which the counter emits
     * its node counts.
     */
    private static final String NUM_NODES = "_num_nodes_";

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkRank$Analyzer.class */
    /**
     * Map/reduce pair that performs one scoring pass of the link analysis.
     *
     * <p>The mapper forwards every input value, wrapped in an {@link ObjectWritable}, under its
     * original key. The reducer sums the scores of the {@link LinkDatum} values received for a
     * key and stores the resulting score as the inlink score of a copy of that key's
     * {@link Node}.
     */
    private static class Analyzer implements Mapper<Text, Writable, Text, ObjectWritable>, Reducer<Text, ObjectWritable, Text, Node> {
        private JobConf conf;
        private float dampingFactor = 0.85f;
        private float rankOne = 0.0f;
        private int itNum = 0;
        private boolean limitPages = true;
        private boolean limitDomains = true;

        private Analyzer() {
        }

        /**
         * Reads the damping factor, rank-one seed, iteration number and the duplicate page/domain
         * limits from the job configuration.
         *
         * @param jobConf the configuration of the running job
         * @throws IllegalArgumentException if reading any of the settings fails
         */
        @Override
        public void configure(JobConf jobConf) {
            try {
                this.conf = jobConf;
                this.dampingFactor = jobConf.getFloat("link.analyze.damping.factor", 0.85f);
                this.rankOne = jobConf.getFloat("link.analyze.rank.one", 0.0f);
                this.itNum = jobConf.getInt("link.analyze.iteration", 0);
                this.limitPages = jobConf.getBoolean("link.ignore.limit.page", true);
                this.limitDomains = jobConf.getBoolean("link.ignore.limit.domain", true);
            } catch (Exception e) {
                LinkRank.LOG.error(StringUtils.stringifyException(e));
                throw new IllegalArgumentException(e);
            }
        }

        /**
         * Emits a clone of the input value, wrapped in an {@link ObjectWritable}, under the
         * unchanged key.
         */
        @Override
        public void map(Text text, Writable writable, OutputCollector<Text, ObjectWritable> outputCollector, Reporter reporter) throws IOException {
            ObjectWritable wrapped = new ObjectWritable();
            wrapped.set(WritableUtils.clone(writable, this.conf));
            outputCollector.collect(text, wrapped);
        }

        /**
         * Computes the new inlink score for one URL and emits the updated node.
         *
         * <p>Starting from the configured rank-one value, the score of every accepted inlink is
         * added. An inlink is ignored when page limiting is on and its page was already counted,
         * or when domain limiting is on and its domain was already counted. The final score is
         * {@code (1 - dampingFactor) + dampingFactor * total}.
         *
         * <p>If no {@link Node} value arrives for the key there is nothing to attach the score
         * to; the key is skipped with a warning instead of failing the task with a
         * {@link NullPointerException}.
         *
         * @throws IOException if a link URL cannot be processed or the output cannot be written
         */
        @Override
        public void reduce(Text text, Iterator<ObjectWritable> it, OutputCollector<Text, Node> outputCollector, Reporter reporter) throws IOException {
            String targetUrl = text.toString();
            Set<String> seenDomains = new HashSet<String>();
            Set<String> seenPages = new HashSet<String>();
            Node node = null;
            int numInlinks = 0;
            float totalInlinkScore = this.rankOne;
            // Checked once so the per-inlink debug strings are only built when they are wanted.
            final boolean debug = LinkRank.LOG.isDebugEnabled();

            while (it.hasNext()) {
                Object value = it.next().get();
                if (value instanceof Node) {
                    node = (Node) value;
                } else if (value instanceof LinkDatum) {
                    LinkDatum inlink = (LinkDatum) value;
                    float score = inlink.getScore();
                    String fromUrl = inlink.getUrl();
                    String domain = URLUtil.getDomainName(fromUrl);
                    String page = URLUtil.getPage(fromUrl);
                    boolean duplicate = (this.limitPages && seenPages.contains(page))
                        || (this.limitDomains && seenDomains.contains(domain));
                    if (duplicate) {
                        if (debug) {
                            LinkRank.LOG.debug(targetUrl + ": ignoring " + score + " from " + fromUrl + ", duplicate page or domain");
                        }
                    } else {
                        numInlinks++;
                        totalInlinkScore += score;
                        seenDomains.add(domain);
                        seenPages.add(page);
                        if (debug) {
                            LinkRank.LOG.debug(targetUrl + ": adding " + score + " from " + fromUrl + ", total: " + totalInlinkScore);
                        }
                    }
                }
            }

            if (node == null) {
                // Cloning a null node would throw; report the inconsistent input and move on.
                LinkRank.LOG.warn(targetUrl + ": no Node object received, skipping score update");
                return;
            }

            float linkRank = (1.0f - this.dampingFactor) + (this.dampingFactor * totalInlinkScore);
            if (debug) {
                LinkRank.LOG.debug(targetUrl + ": score: " + linkRank + " num inlinks: " + numInlinks + " iteration: " + this.itNum);
            }
            Node updated = (Node) WritableUtils.clone(node, this.conf);
            updated.setInlinkScore(linkRank);
            outputCollector.collect(text, updated);
        }

        /** Nothing to release. */
        @Override
        public void close() throws IOException {
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkRank$Counter.class */
    /**
     * Map/reduce pair (also usable as a combiner) that counts the entries of the node database:
     * the mapper emits a one for every record under a single fixed key and the reducer adds the
     * values up.
     */
    private static class Counter implements Mapper<Text, Node, Text, LongWritable>, Reducer<Text, LongWritable, Text, LongWritable> {
        private JobConf conf;
        private static Text numNodes = new Text(LinkRank.NUM_NODES);
        private static LongWritable one = new LongWritable(1);

        private Counter() {
        }

        /** Keeps a reference to the job configuration. */
        @Override
        public void configure(JobConf jobConf) {
            this.conf = jobConf;
        }

        /** Emits {@code (numNodes, 1)} for each input record, regardless of its content. */
        @Override
        public void map(Text text, Node node, OutputCollector<Text, LongWritable> outputCollector, Reporter reporter) throws IOException {
            outputCollector.collect(numNodes, one);
        }

        /** Sums all values received for the key and emits the total under the fixed key. */
        @Override
        public void reduce(Text text, Iterator<LongWritable> it, OutputCollector<Text, LongWritable> outputCollector, Reporter reporter) throws IOException {
            long total = 0L;
            while (it.hasNext()) {
                total += it.next().get();
            }
            outputCollector.collect(numNodes, new LongWritable(total));
        }

        /** Nothing to release. */
        @Override
        public void close() {
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkRank$Initializer.class */
    /**
     * Mapper that copies every node and sets the copy's inlink score to the configured initial
     * score ({@code link.analyze.initial.score}, default 1.0).
     */
    private static class Initializer implements Mapper<Text, Node, Text, Node> {
        private JobConf conf;
        private float initialScore = 1.0f;

        private Initializer() {
        }

        /** Stores the job configuration and reads the initial score from it. */
        @Override
        public void configure(JobConf jobConf) {
            this.initialScore = jobConf.getFloat("link.analyze.initial.score", 1.0f);
            this.conf = jobConf;
        }

        /** Emits a clone of the node, with its inlink score reset, under a fresh copy of the key. */
        @Override
        public void map(Text text, Node node, OutputCollector<Text, Node> outputCollector, Reporter reporter) throws IOException {
            Text keyCopy = new Text(text.toString());
            Node seeded = (Node) WritableUtils.clone(node, this.conf);
            seeded.setInlinkScore(this.initialScore);
            outputCollector.collect(keyCopy, seeded);
        }

        /** Nothing to release. */
        @Override
        public void close() {
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkRank$Inverter.class */
    /**
     * Map/reduce pair that turns the outlinks of each URL into scored inlinks of the link
     * targets.
     *
     * <p>The mapper forwards every input value wrapped in an {@link ObjectWritable}. For each
     * source URL the reducer collects its {@link Node}, its {@link LinkDatum} outlinks and an
     * optional {@link Loops.LoopSet}, then emits one {@link LinkDatum} per outlink keyed by the
     * link target and carrying the source node's outlink score.
     */
    private static class Inverter implements Mapper<Text, Writable, Text, ObjectWritable>, Reducer<Text, ObjectWritable, Text, LinkDatum> {
        private JobConf conf;

        private Inverter() {
        }

        /** Keeps a reference to the job configuration (used for cloning in the reducer). */
        @Override
        public void configure(JobConf jobConf) {
            this.conf = jobConf;
        }

        /** Emits the input value, wrapped in an {@link ObjectWritable}, under the unchanged key. */
        @Override
        public void map(Text text, Writable writable, OutputCollector<Text, ObjectWritable> outputCollector, Reporter reporter) throws IOException {
            ObjectWritable wrapped = new ObjectWritable();
            wrapped.set(writable);
            outputCollector.collect(text, wrapped);
        }

        /**
         * Inverts the outlinks of one source URL.
         *
         * <p>Nothing is emitted when the node reports no outlinks. Outlinks whose target is in
         * the loop set (when one was received) are skipped. Every other outlink is rewritten to
         * point back at the source URL, given the node's outlink score, and emitted under the
         * target URL.
         *
         * <p>A key that arrives without a {@link Node} cannot be inverted; it is skipped with a
         * warning rather than failing the task with a {@link NullPointerException}.
         *
         * @throws IOException if the output cannot be written
         */
        @Override
        public void reduce(Text text, Iterator<ObjectWritable> it, OutputCollector<Text, LinkDatum> outputCollector, Reporter reporter) throws IOException {
            String fromUrl = text.toString();
            ArrayList<LinkDatum> outlinks = new ArrayList<LinkDatum>();
            Node node = null;
            Loops.LoopSet loops = null;

            while (it.hasNext()) {
                Object value = it.next().get();
                if (value instanceof Node) {
                    node = (Node) value;
                } else if (value instanceof LinkDatum) {
                    outlinks.add((LinkDatum) WritableUtils.clone((LinkDatum) value, this.conf));
                } else if (value instanceof Loops.LoopSet) {
                    loops = (Loops.LoopSet) value;
                }
            }

            if (node == null) {
                if (loops != null) {
                    LinkRank.LOG.warn("LoopSet without Node object received for " + fromUrl + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph.");
                } else {
                    // Without a node there is no outlink score to distribute.
                    LinkRank.LOG.warn("No Node object received for " + fromUrl + ", skipping link inversion.");
                }
                return;
            }

            final boolean debug = LinkRank.LOG.isDebugEnabled();
            int numOutlinks = node.getNumOutlinks();
            float inlinkScore = node.getInlinkScore();
            float outlinkScore = node.getOutlinkScore();
            if (debug) {
                LinkRank.LOG.debug(fromUrl + ": num outlinks " + numOutlinks);
            }
            if (numOutlinks <= 0) {
                return;
            }

            Set<String> loopUrls = loops != null ? loops.getLoopSet() : null;
            for (LinkDatum outlink : outlinks) {
                String toUrl = outlink.getUrl();
                if (loopUrls != null && loopUrls.contains(toUrl)) {
                    if (debug) {
                        LinkRank.LOG.debug(fromUrl + ": Skipping inverting inlink from loop " + toUrl);
                    }
                    continue;
                }
                // Reuse the datum: it now describes an inlink of toUrl coming from fromUrl.
                outlink.setUrl(fromUrl);
                outlink.setScore(outlinkScore);
                outputCollector.collect(new Text(toUrl), outlink);
                if (debug) {
                    LinkRank.LOG.debug(toUrl + ": inverting inlink from " + fromUrl + " origscore: " + inlinkScore + " numOutlinks: " + numOutlinks + " inlinkscore: " + outlinkScore);
                }
            }
        }

        /** Nothing to release. */
        @Override
        public void close() {
        }
    }

    /**
     * Runs the counter job over the node database and returns the count it produced.
     *
     * <p>The job writes a single text file into a temporary directory named {@code NUM_NODES}
     * under the web graph path. The first line of {@code part-00000} is read, the temporary
     * directory is deleted, and the second whitespace-separated field of that line is parsed as
     * the count.
     *
     * @param fileSystem file system holding the web graph
     * @param path root directory of the web graph
     * @return the count parsed from the job output
     * @throws IOException if the job fails, the output cannot be read, or the output is empty
     */
    private int runCounter(FileSystem fileSystem, Path path) throws IOException {
        Path numLinksPath = new Path(path, NUM_NODES);
        Path nodeDb = new Path(path, WebGraph.NODE_DIR);
        NutchJob counter = new NutchJob(getConf());
        counter.setJobName("LinkRank Counter");
        FileInputFormat.addInputPath(counter, nodeDb);
        FileOutputFormat.setOutputPath(counter, numLinksPath);
        counter.setInputFormat(SequenceFileInputFormat.class);
        counter.setMapperClass(Counter.class);
        counter.setCombinerClass(Counter.class);
        counter.setReducerClass(Counter.class);
        counter.setMapOutputKeyClass(Text.class);
        counter.setMapOutputValueClass(LongWritable.class);
        counter.setOutputKeyClass(Text.class);
        counter.setOutputValueClass(LongWritable.class);
        // A single reducer so the total ends up in exactly one part file.
        counter.setNumReduceTasks(1);
        counter.setOutputFormat(TextOutputFormat.class);
        counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        LOG.info("Starting link counter job");
        try {
            JobClient.runJob(counter);
            LOG.info("Finished link counter job");
            LOG.info("Reading numlinks temp file");
            String numLinksLine;
            FSDataInputStream readLinks = fileSystem.open(new Path(numLinksPath, "part-00000"));
            try {
                // Explicit charset: do not depend on the platform default when decoding the file.
                numLinksLine = new BufferedReader(new InputStreamReader(readLinks, "UTF-8")).readLine();
            } finally {
                // Always release the stream, even when reading fails.
                readLinks.close();
            }
            if (numLinksLine == null || numLinksLine.length() == 0) {
                fileSystem.delete(numLinksPath, true);
                throw new IOException("No links to process, is the webgraph empty?");
            }
            LOG.info("Deleting numlinks temp file");
            fileSystem.delete(numLinksPath, true);
            return Integer.parseInt(numLinksLine.split("\\s+")[1]);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
    }

    /**
     * Runs the map-only initializer job, which copies the node database from {@code path} to
     * {@code path2} as a map file using the {@link Initializer} mapper.
     *
     * @param path input node database
     * @param path2 output directory for the initialized nodes
     * @throws IOException if the job fails
     */
    private void runInitializer(Path path, Path path2) throws IOException {
        NutchJob initializer = new NutchJob(getConf());
        initializer.setJobName("LinkAnalysis Initializer");

        // Input and output locations/formats.
        FileInputFormat.addInputPath(initializer, path);
        initializer.setInputFormat(SequenceFileInputFormat.class);
        FileOutputFormat.setOutputPath(initializer, path2);
        initializer.setOutputFormat(MapFileOutputFormat.class);

        // Mapper and key/value types.
        initializer.setMapperClass(Initializer.class);
        initializer.setMapOutputKeyClass(Text.class);
        initializer.setMapOutputValueClass(Node.class);
        initializer.setOutputKeyClass(Text.class);
        initializer.setOutputValueClass(Node.class);

        initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        LOG.info("Starting initialization job");
        try {
            JobClient.runJob(initializer);
        } catch (IOException failure) {
            LOG.error(StringUtils.stringifyException(failure));
            throw failure;
        }
        LOG.info("Finished initialization job.");
    }

    /**
     * Runs the inverter job over the given inputs and writes the resulting {@link LinkDatum}
     * records to {@code path4} as a sequence file.
     *
     * @param path first input directory
     * @param path2 second input directory
     * @param path3 optional third input directory; skipped when {@code null}
     * @param path4 output directory
     * @throws IOException if the job fails
     */
    private void runInverter(Path path, Path path2, Path path3, Path path4) throws IOException {
        NutchJob inverter = new NutchJob(getConf());
        inverter.setJobName("LinkAnalysis Inverter");

        // Inputs, in the same order as before; the third one is optional.
        FileInputFormat.addInputPath(inverter, path);
        FileInputFormat.addInputPath(inverter, path2);
        if (path3 != null) {
            FileInputFormat.addInputPath(inverter, path3);
        }
        inverter.setInputFormat(SequenceFileInputFormat.class);

        // Output.
        FileOutputFormat.setOutputPath(inverter, path4);
        inverter.setOutputFormat(SequenceFileOutputFormat.class);

        // Processing classes and key/value types.
        inverter.setMapperClass(Inverter.class);
        inverter.setReducerClass(Inverter.class);
        inverter.setMapOutputKeyClass(Text.class);
        inverter.setMapOutputValueClass(ObjectWritable.class);
        inverter.setOutputKeyClass(Text.class);
        inverter.setOutputValueClass(LinkDatum.class);

        inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        LOG.info("Starting inverter job");
        try {
            JobClient.runJob(inverter);
        } catch (IOException failure) {
            LOG.error(StringUtils.stringifyException(failure));
            throw failure;
        }
        LOG.info("Finished inverter job.");
    }

    /**
     * Runs one analyzer job, reading {@code path} and {@code path2} and writing the rescored
     * nodes to {@code path3} as a map file.
     *
     * @param path first input directory
     * @param path2 second input directory
     * @param path3 output directory
     * @param i zero-based index of the current iteration
     * @param i2 total number of iterations (used in the job name only)
     * @param f rank-one value passed to the job as {@code link.analyze.rank.one}
     * @throws IOException if the job fails
     */
    private void runAnalysis(Path path, Path path2, Path path3, int i, int i2, float f) throws IOException {
        // Iterations are reported one-based.
        int iteration = i + 1;

        NutchJob analyzer = new NutchJob(getConf());
        analyzer.set("link.analyze.iteration", String.valueOf(iteration));
        analyzer.set("link.analyze.rank.one", String.valueOf(f));
        analyzer.setJobName("LinkAnalysis Analyzer, iteration " + iteration + " of " + i2);

        // Inputs and output.
        FileInputFormat.addInputPath(analyzer, path);
        FileInputFormat.addInputPath(analyzer, path2);
        analyzer.setInputFormat(SequenceFileInputFormat.class);
        FileOutputFormat.setOutputPath(analyzer, path3);
        analyzer.setOutputFormat(MapFileOutputFormat.class);

        // Processing classes and key/value types.
        analyzer.setMapperClass(Analyzer.class);
        analyzer.setReducerClass(Analyzer.class);
        analyzer.setMapOutputKeyClass(Text.class);
        analyzer.setMapOutputValueClass(ObjectWritable.class);
        analyzer.setOutputKeyClass(Text.class);
        analyzer.setOutputValueClass(Node.class);

        analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        LOG.info("Starting analysis job");
        try {
            JobClient.runJob(analyzer);
        } catch (IOException failure) {
            LOG.error(StringUtils.stringifyException(failure));
            throw failure;
        }
        LOG.info("Finished analysis job.");
    }

    /** Creates the tool through the no-argument constructor of the {@link Configured} base class. */
    public LinkRank() {
        super();
    }

    /**
     * Creates the tool with the given configuration.
     *
     * @param configuration configuration handed to the {@link Configured} base class
     */
    public LinkRank(Configuration configuration) {
        super(configuration);
    }

    /** Does nothing. */
    public void close() {
        // Intentionally empty.
    }

    /**
     * Runs the complete link analysis over a web graph.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>create the {@code linkrank} working directory under {@code path} if it is missing;</li>
     *   <li>run the counter job and derive the rank-one value as {@code 1 / count};</li>
     *   <li>run the initializer job to seed the working node database;</li>
     *   <li>for each of {@code link.analyze.num.iterations} (default 10) iterations, run the
     *       inverter and analyzer jobs into a randomly named sibling directory and install the
     *       result with {@code FSUtils.replace};</li>
     *   <li>install the final working node database over the web graph's node database and
     *       delete the working directory.</li>
     * </ol>
     * The loops directory is passed to the inverter only when it exists.
     *
     * @param path root directory of the web graph
     * @throws IOException if a job or a file system operation fails
     */
    public void analyze(Path path) throws IOException {
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("Analysis: starting at " + timestampFormat.format(Long.valueOf(start)));

        Path linkRank = new Path(path, "linkrank");
        Configuration conf = getConf();
        FileSystem fileSystem = FileSystem.get(conf);
        if (!fileSystem.exists(linkRank)) {
            fileSystem.mkdirs(linkRank);
        }

        Path wgOutlinkDb = new Path(path, WebGraph.OUTLINK_DIR);
        Path wgNodeDb = new Path(path, WebGraph.NODE_DIR);
        Path nodeDb = new Path(linkRank, WebGraph.NODE_DIR);
        Path loopDb = new Path(path, Loops.LOOPS_DIR);
        if (!fileSystem.exists(loopDb)) {
            // No loops data available: the inverter treats null as "no loops input".
            loopDb = null;
        }

        int numLinks = runCounter(fileSystem, path);
        runInitializer(wgNodeDb, nodeDb);
        float rankOne = 1.0f / numLinks;
        if (LOG.isInfoEnabled()) {
            LOG.info("Analysis: Number of links: " + numLinks);
            LOG.info("Analysis: Rank One: " + rankOne);
        }

        int numIterations = conf.getInt("link.analyze.num.iterations", 10);
        for (int i = 0; i < numIterations; i++) {
            LOG.info("Analysis: Starting iteration " + (i + 1) + " of " + numIterations);
            // Sibling temp directory "<linkrank>-<random>"; a plain "-" literal is used as the
            // separator so the name does not depend on a constant of an unrelated library.
            Path tempRank = new Path(linkRank + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
            fileSystem.mkdirs(tempRank);
            Path tempInverted = new Path(tempRank, "inverted");
            Path tempNodeDb = new Path(tempRank, WebGraph.NODE_DIR);

            runInverter(nodeDb, wgOutlinkDb, loopDb, tempInverted);
            runAnalysis(nodeDb, tempInverted, tempNodeDb, i, numIterations, rankOne);

            LOG.info("Analysis: Installing new link scores");
            FSUtils.replace(fileSystem, linkRank, tempRank, true);
            LOG.info("Analysis: finished iteration " + (i + 1) + " of " + numIterations);
        }

        LOG.info("Analysis: Installing web graph nodes");
        FSUtils.replace(fileSystem, wgNodeDb, nodeDb, true);
        fileSystem.delete(linkRank, true);

        long end = System.currentTimeMillis();
        LOG.info("Analysis: finished at " + timestampFormat.format(Long.valueOf(end)) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    /**
     * Command-line entry point: runs the tool through {@link ToolRunner} with a Nutch
     * configuration and exits the JVM with the tool's return code.
     *
     * @param strArr command-line arguments
     * @throws Exception if {@link ToolRunner} propagates a failure
     */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(NutchConfiguration.create(), new LinkRank(), strArr);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs the analysis.
     *
     * <p>Recognized options are {@code -help} and {@code -webgraphdb <webgraphdb>}.
     *
     * @param strArr command-line arguments
     * @return 0 on success, -1 when help was requested or {@code -webgraphdb} is missing (usage
     *     is printed), -2 when parsing or the analysis throws
     */
    @Override
    public int run(String[] strArr) throws Exception {
        Option helpOption = OptionBuilder.withArgName("help")
            .withDescription("show this help message")
            .create("help");
        Option webGraphDbOption = OptionBuilder.withArgName("webgraphdb")
            .hasArg()
            .withDescription("the web graph db to use")
            .create("webgraphdb");

        Options options = new Options();
        options.addOption(helpOption);
        options.addOption(webGraphDbOption);

        try {
            CommandLine line = new GnuParser().parse(options, strArr);
            boolean runnable = !line.hasOption("help") && line.hasOption("webgraphdb");
            if (!runnable) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("LinkRank", options);
                return -1;
            }
            String webGraphDb = line.getOptionValue("webgraphdb");
            analyze(new Path(webGraphDb));
            return 0;
        } catch (Exception e) {
            LOG.error("LinkAnalysis: " + StringUtils.stringifyException(e));
            return -2;
        }
    }
}
