package org.apache.nutch.scoring.webgraph;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.scoring.webgraph.Loops;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkDumper.class */
public class LinkDumper extends Configured implements Tool {
    public static final Logger LOG = LoggerFactory.getLogger(LinkDumper.class);
    public static final String DUMP_DIR = "linkdump";

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkDumper$Inverter.class */
    /**
     * Mapper and reducer of the inverter job.
     *
     * <p>The map side wraps every input value in an {@link ObjectWritable} under its
     * unchanged key. The reduce side gathers, per key, a {@link Node}, any number of
     * {@link LinkDatum} values and an optional {@link Loops.LoopSet}, and emits one
     * {@link LinkNode} per link datum, keyed by the link datum's URL.
     */
    public static class Inverter implements Mapper<Text, Writable, Text, ObjectWritable>, Reducer<Text, ObjectWritable, Text, LinkNode> {
        /** Job configuration captured in {@link #configure(JobConf)}; used when cloning link data. */
        private JobConf conf;

        /**
         * Stores the job configuration for later use by {@link #reduce}.
         *
         * @param jobConf the configuration of the running job
         */
        @Override
        public void configure(JobConf jobConf) {
            this.conf = jobConf;
        }

        /**
         * Wraps the incoming value in an {@link ObjectWritable} and emits it under the same key.
         *
         * @param text the input key, passed through unchanged
         * @param writable the input value to wrap
         * @param outputCollector receives the wrapped value
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void map(Text text, Writable writable, OutputCollector<Text, ObjectWritable> outputCollector, Reporter reporter) throws IOException {
            ObjectWritable wrapped = new ObjectWritable();
            wrapped.set(writable);
            outputCollector.collect(text, wrapped);
        }

        /**
         * Emits, for every link datum seen under the key, a {@link LinkNode} that pairs
         * the key's URL with the key's node, keyed by the link datum's URL.
         *
         * <p>Nothing is emitted when no {@link Node} was seen for the key, when the node
         * reports no outlinks, or for link URLs contained in the key's loop set.
         *
         * @param text the key whose values are being reduced
         * @param it the wrapped values collected for the key
         * @param outputCollector receives the inverted (link URL, LinkNode) pairs
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(Text text, Iterator<ObjectWritable> it, OutputCollector<Text, LinkNode> outputCollector, Reporter reporter) throws IOException {
            String fromUrl = text.toString();
            ArrayList<LinkDatum> outlinks = new ArrayList<LinkDatum>();
            Node node = null;
            Loops.LoopSet loops = null;

            while (it.hasNext()) {
                Object value = it.next().get();
                if (value instanceof Node) {
                    node = (Node) value;
                } else if (value instanceof LinkDatum) {
                    // A copy is stored rather than the instance handed out by the iterator.
                    outlinks.add((LinkDatum) WritableUtils.clone((LinkDatum) value, this.conf));
                } else if (value instanceof Loops.LoopSet) {
                    loops = (Loops.LoopSet) value;
                }
            }

            // Without a node there is nothing to attach to the inverted links; previously
            // this case failed with a NullPointerException and aborted the task.
            if (node == null || node.getNumOutlinks() <= 0) {
                return;
            }

            Set<String> loopUrls = loops != null ? loops.getLoopSet() : null;
            for (LinkDatum outlink : outlinks) {
                String toUrl = outlink.getUrl();
                if (loopUrls == null || !loopUrls.contains(toUrl)) {
                    outputCollector.collect(new Text(toUrl), new LinkNode(fromUrl, node));
                }
            }
        }

        /** No resources to release. */
        @Override
        public void close() {
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkDumper$LinkNode.class */
    /**
     * Writable pairing of a URL string with a {@link Node}.
     *
     * <p>Serialized form: the URL via {@code writeUTF}, followed by the node's own
     * serialized fields.
     */
    public static class LinkNode implements Writable {
        /** URL carried by this entry; {@code null} until set or read. */
        private String url;
        /** Node carried by this entry; {@code null} until set or read. */
        private Node node;

        /** Creates an empty instance with no URL and no node. */
        public LinkNode() {
        }

        /**
         * Creates an instance holding the given URL and node.
         *
         * @param str the URL to store
         * @param node the node to store
         */
        public LinkNode(String str, Node node) {
            this.url = str;
            this.node = node;
        }

        /** @return the stored URL */
        public String getUrl() {
            return this.url;
        }

        /** @param str the URL to store */
        public void setUrl(String str) {
            this.url = str;
        }

        /** @return the stored node */
        public Node getNode() {
            return this.node;
        }

        /** @param node the node to store */
        public void setNode(Node node) {
            this.node = node;
        }

        /**
         * Replaces the URL and node with values read from the input: first the URL,
         * then the fields of a freshly created {@link Node}.
         *
         * @param dataInput the source to read from
         * @throws IOException if reading fails
         */
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            this.url = dataInput.readUTF();
            this.node = new Node();
            this.node.readFields(dataInput);
        }

        /**
         * Writes the URL followed by the node's fields.
         *
         * @param dataOutput the sink to write to
         * @throws IOException if writing fails
         */
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeUTF(this.url);
            this.node.write(dataOutput);
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkDumper$LinkNodes.class */
    /**
     * Writable wrapper around an array of {@link LinkNode} entries.
     *
     * <p>Serialized form: an int element count followed by that many serialized
     * {@link LinkNode} records. The count is always written, including {@code 0} for
     * a {@code null} or empty array, so that {@link #write} and {@link #readFields}
     * stay symmetric.
     */
    public static class LinkNodes implements Writable {
        /** The wrapped entries; {@code null} when nothing has been set or read. */
        private LinkNode[] links;

        /** Creates an instance with no links. */
        public LinkNodes() {
        }

        /** @param linkNodeArr the entries to wrap */
        public LinkNodes(LinkNode[] linkNodeArr) {
            this.links = linkNodeArr;
        }

        /** @return the wrapped entries, or {@code null} when there are none */
        public LinkNode[] getLinks() {
            return this.links;
        }

        /** @param linkNodeArr the entries to wrap */
        public void setLinks(LinkNode[] linkNodeArr) {
            this.links = linkNodeArr;
        }

        /**
         * Replaces the wrapped entries with those read from the input.
         *
         * <p>A count of zero or less clears the entries, so an instance that is read
         * into more than once never keeps links from an earlier record.
         *
         * @param dataInput the source to read from
         * @throws IOException if reading fails
         */
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            int count = dataInput.readInt();
            if (count <= 0) {
                this.links = null;
                return;
            }
            LinkNode[] loaded = new LinkNode[count];
            for (int i = 0; i < count; i++) {
                LinkNode entry = new LinkNode();
                entry.readFields(dataInput);
                loaded[i] = entry;
            }
            this.links = loaded;
        }

        /**
         * Writes the element count followed by every entry.
         *
         * <p>The count is written even when there are no entries; writing nothing at
         * all would leave {@link #readFields} without the int it unconditionally reads.
         *
         * @param dataOutput the sink to write to
         * @throws IOException if writing fails
         */
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            int count = this.links == null ? 0 : this.links.length;
            dataOutput.writeInt(count);
            for (int i = 0; i < count; i++) {
                this.links[i].write(dataOutput);
            }
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkDumper$Merger.class */
    /**
     * Reducer of the merger job: gathers the {@link LinkNode} values of a key into a
     * single {@link LinkNodes} record, keeping at most {@code maxInlinks} of them.
     */
    public static class Merger implements Reducer<Text, LinkNode, Text, LinkNodes> {
        /** Job configuration captured in {@link #configure(JobConf)}; used when cloning values. */
        private JobConf conf;
        /** Upper bound on the number of values kept per key. */
        private int maxInlinks = 50000;

        /**
         * Stores the job configuration for later use by {@link #reduce}.
         *
         * @param jobConf the configuration of the running job
         */
        @Override
        public void configure(JobConf jobConf) {
            this.conf = jobConf;
        }

        /**
         * Copies up to {@code maxInlinks} values of the key and emits them as one
         * {@link LinkNodes} record under the same key.
         *
         * @param text the key being reduced
         * @param it the values collected for the key
         * @param outputCollector receives the merged record
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(Text text, Iterator<LinkNode> it, OutputCollector<Text, LinkNodes> outputCollector, Reporter reporter) throws IOException {
            ArrayList<LinkNode> kept = new ArrayList<LinkNode>();
            while (it.hasNext()) {
                LinkNode current = it.next();
                // The list size doubles as the counter of values accepted so far.
                if (kept.size() >= this.maxInlinks) {
                    break;
                }
                kept.add((LinkNode) WritableUtils.clone(current, this.conf));
            }
            LinkNode[] merged = kept.toArray(new LinkNode[kept.size()]);
            outputCollector.collect(text, new LinkNodes(merged));
        }

        /** No resources to release. */
        @Override
        public void close() {
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/LinkDumper$Reader.class */
    /**
     * Command-line lookup tool that prints the {@link LinkNode} entries stored for
     * one URL in the {@code linkdump} directory of a web graph database.
     */
    public static class Reader {
        /**
         * Looks up the given URL in the link dump and prints each stored entry as
         * {@code "  <url> - <node>"} beneath a {@code "<url>:"} header line.
         *
         * <p>When fewer than two arguments are given a usage line is printed instead.
         * When the dump holds no entry for the URL only the header line is printed.
         *
         * @param strArr {@code [0]} the web graph database path, {@code [1]} the URL to look up
         * @throws Exception if the dump cannot be opened or read
         */
        public static void main(String[] strArr) throws Exception {
            if (strArr == null || strArr.length < 2) {
                System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
                return;
            }
            Configuration conf = NutchConfiguration.create();
            FileSystem fileSystem = FileSystem.get(conf);
            Path webGraphDb = new Path(strArr[0]);
            String url = strArr[1];
            MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fileSystem, new Path(webGraphDb, LinkDumper.DUMP_DIR), conf);
            try {
                Text key = new Text(url);
                LinkNodes found = new LinkNodes();
                MapFileOutputFormat.getEntry(readers, new HashPartitioner(), key, found);
                LinkNode[] links = found.getLinks();
                System.out.println(url + ":");
                // links stays null when nothing was read into 'found' (URL not in the dump).
                if (links != null) {
                    for (LinkNode link : links) {
                        System.out.println("  " + link.getUrl() + " - " + link.getNode().toString());
                    }
                }
            } finally {
                // Close the readers even when the lookup or printing fails.
                FSUtils.closeReaders(readers);
            }
        }
    }

    /**
     * Builds the link dump for a web graph database by running two jobs in sequence.
     *
     * <p>The inverter job reads the node directory, the outlink directory and, when
     * it exists, the loops directory, and writes its output to a temporary
     * {@code inverted-<random>} directory under the database. The merger job reads
     * that directory and writes the final map files to the {@link #DUMP_DIR}
     * directory. The temporary directory is deleted after a successful run and, on a
     * best-effort basis, after a failed one.
     *
     * @param path the web graph database directory
     * @throws IOException if either job or the file system access fails; the
     *         exception is logged before being rethrown
     */
    public void dumpLinks(Path path) throws IOException {
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("LinkDumper: starting at " + timestampFormat.format(Long.valueOf(start)));
        Configuration conf = getConf();
        FileSystem fileSystem = FileSystem.get(conf);
        Path dumpDir = new Path(path, DUMP_DIR);
        Path tempInverted = new Path(path, "inverted-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        boolean completed = false;
        try {
            NutchJob inverter = createInverterJob(conf, fileSystem, path, tempInverted);
            LOG.info("LinkDumper: running inverter");
            JobClient.runJob(inverter);
            LOG.info("LinkDumper: finished inverter");

            NutchJob merger = createMergerJob(conf, tempInverted, dumpDir);
            LOG.info("LinkDumper: running merger");
            JobClient.runJob(merger);
            LOG.info("LinkDumper: finished merger");

            fileSystem.delete(tempInverted, true);
            completed = true;
            long end = System.currentTimeMillis();
            LOG.info("LinkDumper: finished at " + timestampFormat.format(Long.valueOf(end)) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        } finally {
            if (!completed) {
                // Do not leave the intermediate output behind when a job failed.
                deleteTempQuietly(fileSystem, tempInverted);
            }
        }
    }

    /** Configures the inverter job: node, loops (if present) and outlink input, sequence-file output. */
    private static NutchJob createInverterJob(Configuration conf, FileSystem fileSystem, Path webGraphDb, Path output) throws IOException {
        Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
        Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
        Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);

        NutchJob job = new NutchJob(conf);
        job.setJobName("LinkDumper: inverter");
        FileInputFormat.addInputPath(job, nodeDb);
        if (fileSystem.exists(loopSetDb)) {
            FileInputFormat.addInputPath(job, loopSetDb);
        }
        FileInputFormat.addInputPath(job, outlinkDb);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(Inverter.class);
        job.setReducerClass(Inverter.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObjectWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LinkNode.class);
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        return job;
    }

    /** Configures the merger job: inverter output as input, map-file output of LinkNodes. */
    private static NutchJob createMergerJob(Configuration conf, Path input, Path output) {
        NutchJob job = new NutchJob(conf);
        job.setJobName("LinkDumper: merger");
        FileInputFormat.addInputPath(job, input);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setReducerClass(Merger.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LinkNode.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LinkNodes.class);
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputFormat(MapFileOutputFormat.class);
        return job;
    }

    /** Removes the temporary directory if present; a failure is logged, never thrown. */
    private static void deleteTempQuietly(FileSystem fileSystem, Path tempDir) {
        try {
            if (fileSystem.exists(tempDir)) {
                fileSystem.delete(tempDir, true);
            }
        } catch (IOException cleanupFailure) {
            LOG.warn("LinkDumper: could not remove temporary directory " + tempDir + ": " + StringUtils.stringifyException(cleanupFailure));
        }
    }

    /**
     * Command-line entry point: runs a {@link LinkDumper} through {@link ToolRunner}
     * with a Nutch configuration and exits the JVM with the tool's return code.
     *
     * @param strArr the command-line arguments handed to {@link #run(String[])}
     * @throws Exception if the tool runner fails
     */
    public static void main(String[] strArr) throws Exception {
        Configuration conf = NutchConfiguration.create();
        int exitCode = ToolRunner.run(conf, new LinkDumper(), strArr);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and, when a web graph database is given, dumps its links.
     *
     * <p>Recognized options are {@code -help} and {@code -webgraphdb <webgraphdb>}.
     *
     * @param strArr the command-line arguments
     * @return {@code 0} after a successful dump, {@code -1} when help was requested or
     *         {@code -webgraphdb} is missing (usage is printed), {@code -2} when parsing
     *         or dumping threw an exception (the exception is logged)
     * @throws Exception declared by the {@code Tool} contract; every exception raised
     *         while parsing or dumping is caught and reported through the return code
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        Options options = new Options();
        Option helpOption = OptionBuilder.withArgName("help")
                .withDescription("show this help message")
                .create("help");
        Option webGraphDbOption = OptionBuilder.withArgName("webgraphdb")
                .hasArg()
                .withDescription("the web graph database to use")
                .create("webgraphdb");
        options.addOption(helpOption);
        options.addOption(webGraphDbOption);
        try {
            CommandLine commandLine = new GnuParser().parse(options, strArr);
            boolean showHelp = commandLine.hasOption("help") || !commandLine.hasOption("webgraphdb");
            if (!showHelp) {
                String webGraphDb = commandLine.getOptionValue("webgraphdb");
                dumpLinks(new Path(webGraphDb));
                return 0;
            }
            new HelpFormatter().printHelp("LinkDumper", options);
            return -1;
        } catch (Exception e) {
            LOG.error("LinkDumper: " + StringUtils.stringifyException(e));
            return -2;
        }
    }
}
