package org.apache.nutch.scoring.webgraph;

import com.beust.jcommander.Parameters;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Random;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/WebGraph.class */
public class WebGraph extends Configured implements Tool {
    /** Logger used by this tool; the nested {@code OutlinkDb} class also logs through it. */
    public static final Logger LOG = LoggerFactory.getLogger(WebGraph.class);
    /** Lock file name; {@code createWebGraph} creates a lock with this same name inside the web graph directory. */
    public static final String LOCK_NAME = ".locked";
    /** Name of the sub-directory of the web graph directory that receives the InlinkDb job output. */
    public static final String INLINK_DIR = "inlinks";
    /** Name of the sub-directory of the web graph directory that holds the OutlinkDb job output (and is re-read as input on the next run). */
    public static final String OUTLINK_DIR = "outlinks";
    /** Name of the sub-directory of the web graph directory that receives the NodeDb job output. */
    public static final String NODE_DIR = "nodes";

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/WebGraph$InlinkDb.class */
    /**
     * Mapper that inverts link records: for every (source URL, link datum)
     * pair it emits a record keyed by the datum's URL whose value points
     * back at the source URL. The emitted datum keeps the original anchor,
     * is stamped with the time this mapper was configured, and carries link
     * type 1 (the value {@code NodeDb} counts as an inlink).
     */
    private static class InlinkDb extends Configured implements Mapper<Text, LinkDatum, Text, LinkDatum> {
        /** Job configuration handed over in {@link #configure(JobConf)}. */
        private JobConf conf;
        /** Wall-clock time captured once per task in {@link #configure(JobConf)}. */
        private long timestamp;

        /** No-argument constructor; settings arrive later through {@link #configure(JobConf)}. */
        public InlinkDb() {
        }

        /**
         * Creates an instance bound to the given configuration.
         *
         * @param configuration configuration stored via {@code setConf}
         */
        public InlinkDb(Configuration configuration) {
            setConf(configuration);
        }

        /**
         * Remembers the job configuration and fixes the timestamp that every
         * record emitted by this task will carry.
         */
        @Override
        public void configure(JobConf jobConf) {
            this.timestamp = System.currentTimeMillis();
            this.conf = jobConf;
        }

        /** Intentionally empty: this mapper performs no cleanup. */
        @Override
        public void close() {
        }

        /**
         * Emits the reversed link for one input record.
         *
         * @param text            key of the input record, used as the URL of the emitted datum
         * @param linkDatum       input datum; its URL becomes the output key
         * @param outputCollector receives the (datum URL, reversed datum) pair
         * @param reporter        unused
         */
        @Override
        public void map(Text text, LinkDatum linkDatum, OutputCollector<Text, LinkDatum> outputCollector, Reporter reporter) throws IOException {
            final String fromUrl = text.toString();
            final String toUrl = linkDatum.getUrl();

            LinkDatum reversed = new LinkDatum(fromUrl, linkDatum.getAnchor(), this.timestamp);
            // 1 is the link type NodeDb tallies as an inlink
            // (presumably LinkDatum.INLINK -- confirm against LinkDatum).
            reversed.setLinkType((byte) 1);

            Text outputKey = new Text(toUrl);
            outputCollector.collect(outputKey, reversed);
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/WebGraph$NodeDb.class */
    /**
     * Reducer that turns all link datums collected for one URL into a single
     * {@code Node}: datums with link type 1 are counted as inlinks, datums
     * with link type 2 as outlinks, and the inlink score is initialised to 0.
     */
    private static class NodeDb extends Configured implements Reducer<Text, LinkDatum, Text, Node> {
        /** Job configuration handed over in {@link #configure(JobConf)}. */
        private JobConf conf;

        /** No-argument constructor; settings arrive later through {@link #configure(JobConf)}. */
        public NodeDb() {
        }

        /**
         * Creates an instance bound to the given configuration.
         *
         * @param configuration configuration stored via {@code setConf}
         */
        public NodeDb(Configuration configuration) {
            setConf(configuration);
        }

        /** Remembers the job configuration. */
        @Override
        public void configure(JobConf jobConf) {
            this.conf = jobConf;
        }

        /** Intentionally empty: this reducer performs no cleanup. */
        @Override
        public void close() {
        }

        /**
         * Counts the datums of each link type and emits one node for the key.
         *
         * @param text            URL the datums belong to; reused as the output key
         * @param it              link datums grouped under that URL
         * @param outputCollector receives the (URL, node) pair
         * @param reporter        unused
         */
        @Override
        public void reduce(Text text, Iterator<LinkDatum> it, OutputCollector<Text, Node> outputCollector, Reporter reporter) throws IOException {
            Node summary = new Node();
            int inlinkCount = 0;
            int outlinkCount = 0;

            while (it.hasNext()) {
                switch (it.next().getLinkType()) {
                    case 1:
                        inlinkCount++;
                        break;
                    case 2:
                        outlinkCount++;
                        break;
                    default:
                        // any other link type is not counted
                        break;
                }
            }

            summary.setNumInlinks(inlinkCount);
            summary.setNumOutlinks(outlinkCount);
            summary.setInlinkScore(0.0f);
            outputCollector.collect(text, summary);
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/WebGraph$OutlinkDb.class */
    public static class OutlinkDb extends Configured implements Mapper<Text, Writable, Text, NutchWritable>, Reducer<Text, NutchWritable, Text, LinkDatum> {
        /** Job configuration key; when true, {@code configure} creates URL normalizers and URLs are normalized. */
        public static final String URL_NORMALIZING = "webgraph.url.normalizers";
        /** Job configuration key; when true, {@code configure} creates URL filters and URLs are filtered. */
        public static final String URL_FILTERING = "webgraph.url.filters";
        // Loaded from "link.ignore.internal.domain": the reducer drops outlinks whose domain equals the source URL's domain.
        private boolean ignoreDomain = true;
        // Loaded from "link.ignore.internal.host": the reducer drops outlinks whose host equals the source URL's host.
        private boolean ignoreHost = true;
        // Loaded from "link.ignore.limit.page": the reducer emits at most one outlink per target page.
        private boolean limitPages = true;
        // Loaded from "link.ignore.limit.domain": the reducer emits at most one outlink per target domain.
        private boolean limitDomains = true;
        // Loaded from URL_NORMALIZING; gates normalizeUrl.
        private boolean normalize = false;
        // Loaded from URL_FILTERING; gates filterUrl.
        private boolean filter = false;
        // Only created in configure when normalize is true; otherwise stays null.
        private URLNormalizers urlNormalizers;
        // Only created in configure when filter is true; otherwise stays null.
        private URLFilters filters;
        // Job configuration from configure; the reducer passes it to WritableUtils.clone.
        private JobConf conf;

        /**
         * Normalizes a URL when normalization is switched on.
         *
         * @param str URL to normalize
         * @return {@code str} unchanged when normalization is disabled;
         *         otherwise the trimmed normalizer result, or {@code null}
         *         when no normalizers are available or normalization throws
         *         (a warning is logged in the latter case)
         */
        private String normalizeUrl(String str) {
            if (!this.normalize) {
                return str;
            }
            if (this.urlNormalizers == null) {
                return null;
            }
            try {
                return this.urlNormalizers.normalize(str, "default").trim();
            } catch (Exception e) {
                // Best effort: a URL that cannot be normalized is skipped, not fatal.
                WebGraph.LOG.warn("Skipping " + str + ":" + e);
                return null;
            }
        }

        /**
         * Runs a URL through the URL filters when filtering is switched on.
         *
         * @param str URL to filter; may be {@code null}
         * @return {@code str} unchanged when filtering is disabled or
         *         {@code str} is {@code null}; otherwise the result of the
         *         filters, or {@code null} when the filters throw
         */
        private String filterUrl(String str) {
            // A null URL is already rejected; do not hand it to the filters.
            if (!this.filter || str == null) {
                return str;
            }
            try {
                return this.filters.filter(str);
            } catch (Exception e) {
                // Best effort: the URL is dropped rather than failing the task,
                // but leave a trace, in the same way normalizeUrl does.
                WebGraph.LOG.warn("Skipping " + str + ":" + e);
                return null;
            }
        }

        /**
         * Reads the fetch time recorded in the content metadata of a parse.
         *
         * @param parseData parse data whose content metadata is consulted
         *                  under {@code Nutch.FETCH_TIME_KEY}
         * @return the stored value parsed as a long, or the current time when
         *         the value is missing or cannot be read or parsed
         */
        private long getFetchTime(ParseData parseData) {
            try {
                return Long.parseLong(parseData.getContentMeta().get(Nutch.FETCH_TIME_KEY));
            } catch (Exception e) {
                // Deliberate fallback: any failure to obtain the stored time
                // (absent key, malformed number, missing metadata) yields "now".
                return System.currentTimeMillis();
            }
        }

        /** No-argument constructor; job settings are applied later through {@link #configure(JobConf)}. */
        public OutlinkDb() {
            super();
        }

        /**
         * Creates an instance bound to the given configuration.
         *
         * @param configuration configuration stored via {@code setConf}
         */
        public OutlinkDb(Configuration configuration) {
            super();
            setConf(configuration);
        }

        /**
         * Loads the link-selection switches and the normalize/filter switches
         * from the job configuration, and creates the URL normalizers and
         * filters only when the corresponding switch is on.
         *
         * @param jobConf job configuration to read from; also kept for later use
         */
        @Override
        public void configure(JobConf jobConf) {
            this.conf = jobConf;

            // Link-selection rules applied in reduce(); all default to true.
            this.ignoreHost = jobConf.getBoolean("link.ignore.internal.host", true);
            this.ignoreDomain = jobConf.getBoolean("link.ignore.internal.domain", true);
            this.limitPages = jobConf.getBoolean("link.ignore.limit.page", true);
            this.limitDomains = jobConf.getBoolean("link.ignore.limit.domain", true);

            // URL normalization, off unless requested.
            this.normalize = jobConf.getBoolean(URL_NORMALIZING, false);
            if (this.normalize) {
                this.urlNormalizers = new URLNormalizers(jobConf, "default");
            }

            // URL filtering, off unless requested.
            this.filter = jobConf.getBoolean(URL_FILTERING, false);
            if (this.filter) {
                this.filters = new URLFilters(jobConf);
            }
        }

        /**
         * Maps one input record to zero or more records keyed by the
         * normalized source URL. Records whose key is rejected by
         * normalization or filtering are dropped entirely.
         * <ul>
         * <li>{@code CrawlDatum}: when its status is 35, 36 or 37 a
         *     {@code BooleanWritable(true)} marker is emitted; the reducer
         *     treats that marker as "discard all links of this URL".</li>
         * <li>{@code ParseData}: one {@code LinkDatum} per distinct accepted
         *     outlink target, stamped with the parse's fetch time.</li>
         * <li>{@code LinkDatum}: passed through with its URL normalized,
         *     provided the URL is accepted.</li>
         * </ul>
         * Values of any other type are ignored.
         *
         * @param text            source URL; overwritten with its normalized form
         * @param writable        the record value (one of the types above)
         * @param outputCollector receives the emitted records
         * @param reporter        unused
         */
        @Override
        public void map(Text text, Writable writable, OutputCollector<Text, NutchWritable> outputCollector, Reporter reporter) throws IOException {
            String sourceUrl = normalizeUrl(text.toString());
            if (sourceUrl == null || filterUrl(sourceUrl) == null) {
                return;
            }
            text.set(sourceUrl);

            if (writable instanceof CrawlDatum) {
                int status = ((CrawlDatum) writable).getStatus();
                // NOTE(review): 35/36/37 look like the CrawlDatum fetch statuses for
                // temporary redirect, permanent redirect and gone -- confirm against CrawlDatum.
                if (status == 35 || status == 36 || status == 37) {
                    outputCollector.collect(text, new NutchWritable(new BooleanWritable(true)));
                }
                return;
            }

            if (writable instanceof ParseData) {
                ParseData parseData = (ParseData) writable;
                long fetchTime = getFetchTime(parseData);

                // Target URL -> anchor, in first-seen order. A target keeps its first
                // non-null anchor; a null anchor may be replaced by a later duplicate.
                LinkedHashMap<String, String> anchorsByTarget = new LinkedHashMap<String, String>();
                Outlink[] outlinks = parseData.getOutlinks();
                if (outlinks != null) {
                    for (Outlink outlink : outlinks) {
                        String targetUrl = normalizeUrl(outlink.getToUrl());
                        if (targetUrl == null || filterUrl(targetUrl) == null) {
                            continue;
                        }
                        // get() == null covers both "not present" and "present with a null anchor".
                        if (anchorsByTarget.get(targetUrl) == null) {
                            anchorsByTarget.put(targetUrl, outlink.getAnchor());
                        }
                    }
                }
                for (String targetUrl : anchorsByTarget.keySet()) {
                    LinkDatum datum = new LinkDatum(targetUrl, anchorsByTarget.get(targetUrl), fetchTime);
                    outputCollector.collect(text, new NutchWritable(datum));
                }
                return;
            }

            if (writable instanceof LinkDatum) {
                LinkDatum linkDatum = (LinkDatum) writable;
                String targetUrl = normalizeUrl(linkDatum.getUrl());
                if (targetUrl != null && filterUrl(targetUrl) != null) {
                    linkDatum.setUrl(targetUrl);
                    outputCollector.collect(text, new NutchWritable(linkDatum));
                }
            }
        }

        /**
         * Selects the outlinks to keep for one source URL.
         * <p>
         * All {@code LinkDatum} values are buffered (cloned) while the newest
         * timestamp among them is tracked. If a {@code BooleanWritable(true)}
         * marker is encountered the method returns immediately and nothing is
         * emitted for this URL. Otherwise every buffered datum is given link
         * type 2 and is emitted only if it carries the newest timestamp and
         * passes the enabled rules: one link per target page, one link per
         * target domain, no link to the source's own host, no link to the
         * source's own domain.
         *
         * @param text            source URL; reused as the output key
         * @param it              values grouped under that URL
         * @param outputCollector receives the selected (source URL, outlink) pairs
         * @param reporter        receives the "added links" / "removed links" counters
         */
        @Override
        public void reduce(Text text, Iterator<NutchWritable> it, OutputCollector<Text, LinkDatum> outputCollector, Reporter reporter) throws IOException {
            long mostRecent = 0;
            ArrayList<LinkDatum> buffered = new ArrayList<LinkDatum>();

            while (it.hasNext()) {
                Writable value = it.next().get();
                if (value instanceof LinkDatum) {
                    LinkDatum datum = (LinkDatum) value;
                    long timestamp = datum.getTimestamp();
                    if (mostRecent == 0 || mostRecent < timestamp) {
                        mostRecent = timestamp;
                    }
                    // Clone before storing so the buffered datum does not alias the iterator's value.
                    buffered.add((LinkDatum) WritableUtils.clone(datum, this.conf));
                    reporter.incrCounter("WebGraph.outlinks", "added links", 1L);
                } else if ((value instanceof BooleanWritable) && ((BooleanWritable) value).get()) {
                    // Marker emitted by map() for a CrawlDatum: drop every link of this URL.
                    reporter.incrCounter("WebGraph.outlinks", "removed links", 1L);
                    return;
                }
            }

            String sourceUrl = text.toString();
            String sourceDomain = URLUtil.getDomainName(sourceUrl);
            String sourceHost = URLUtil.getHost(sourceUrl);

            HashSet<String> seenDomains = new HashSet<String>();
            HashSet<String> seenPages = new HashSet<String>();

            for (LinkDatum outlink : buffered) {
                String toUrl = outlink.getUrl();
                String toDomain = URLUtil.getDomainName(toUrl);
                String toHost = URLUtil.getHost(toUrl);
                String toPage = URLUtil.getPage(toUrl);
                // 2 is the link type NodeDb tallies as an outlink
                // (presumably LinkDatum.OUTLINK -- confirm against LinkDatum).
                outlink.setLinkType((byte) 2);

                // The checks below run in the same order as the original nested conditions.
                if (outlink.getTimestamp() != mostRecent) {
                    continue; // not from the newest set of links
                }
                if (this.limitPages && seenPages.contains(toPage)) {
                    continue; // already emitted a link to this page
                }
                if (this.limitDomains && seenDomains.contains(toDomain)) {
                    continue; // already emitted a link to this domain
                }
                if (this.ignoreHost && toHost.equalsIgnoreCase(sourceHost)) {
                    continue; // link stays on the source's host
                }
                if (this.ignoreDomain && toDomain.equalsIgnoreCase(sourceDomain)) {
                    continue; // link stays in the source's domain
                }

                outputCollector.collect(text, outlink);
                seenPages.add(toPage);
                seenDomains.add(toDomain);
            }
        }

        /** Intentionally empty: this implementation performs no cleanup when the task ends. */
        @Override // java.io.Closeable, java.lang.AutoCloseable
        public void close() {
        }
    }

    /**
     * Builds or updates the web graph database by running three jobs in
     * sequence, each writing to a temporary directory that replaces its
     * target directory once the job has succeeded:
     * <ol>
     * <li><b>OutlinkDb</b> reads the parse data of every given segment that
     *     has it (plus the crawl fetch data when {@code link.delete.gone} is
     *     true) together with the existing outlinks directory, and rewrites
     *     the outlinks directory.</li>
     * <li><b>InlinkDb</b> reads the outlinks directory and writes the
     *     inlinks directory.</li>
     * <li><b>NodeDb</b> reads the outlinks and inlinks directories and
     *     writes the nodes directory.</li>
     * </ol>
     * A lock file is created in the web graph directory before the first job
     * and removed after the last one, or when a stage fails with an
     * {@code IOException}.
     *
     * @param path    web graph directory; created when it does not exist
     * @param pathArr segment directories to read, may be {@code null}
     * @param z       whether URLs are normalized by the OutlinkDb job
     * @param z2      whether URLs are filtered by the OutlinkDb job
     * @throws IOException if the lock cannot be created or a stage fails; a
     *         failing stage has its temporary output deleted and the lock
     *         removed before the exception is rethrown
     */
    public void createWebGraph(Path path, Path[] pathArr, boolean z, boolean z2) throws IOException {
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("WebGraphDb: starting at " + dateFormat.format(Long.valueOf(start)));
            LOG.info("WebGraphDb: webgraphdb: " + path);
            LOG.info("WebGraphDb: URL normalize: " + z);
            LOG.info("WebGraphDb: URL filter: " + z2);
        }

        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);

        // Lock the web graph directory for the duration of the update.
        Path lock = new Path(path, LOCK_NAME);
        if (!fs.exists(path)) {
            fs.mkdirs(path);
        }
        LockUtil.createLockFile(fs, lock, false);

        // ---- Stage 1: OutlinkDb ------------------------------------------
        Path outlinkDb = new Path(path, OUTLINK_DIR);
        if (!fs.exists(outlinkDb)) {
            // The directory is also an input of this job, so it must exist.
            fs.mkdirs(outlinkDb);
        }
        Path tempOutlinkDb = tempSibling(outlinkDb);
        NutchJob outlinkJob = new NutchJob(conf);
        outlinkJob.setJobName("Outlinkdb: " + outlinkDb);

        boolean deleteGone = conf.getBoolean("link.delete.gone", false);
        if (deleteGone) {
            LOG.info("OutlinkDb: deleting gone links");
        }
        if (pathArr != null) {
            for (Path segment : pathArr) {
                Path parseData = new Path(segment, ParseData.DIR_NAME);
                if (fs.exists(parseData)) {
                    LOG.info("OutlinkDb: adding input: " + parseData);
                    FileInputFormat.addInputPath(outlinkJob, parseData);
                }
                if (deleteGone) {
                    Path crawlFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
                    if (fs.exists(crawlFetch)) {
                        LOG.info("OutlinkDb: adding input: " + crawlFetch);
                        FileInputFormat.addInputPath(outlinkJob, crawlFetch);
                    }
                }
            }
        }
        LOG.info("OutlinkDb: adding input: " + outlinkDb);
        FileInputFormat.addInputPath(outlinkJob, outlinkDb);
        outlinkJob.setBoolean(OutlinkDb.URL_NORMALIZING, z);
        outlinkJob.setBoolean(OutlinkDb.URL_FILTERING, z2);
        outlinkJob.setInputFormat(SequenceFileInputFormat.class);
        outlinkJob.setMapperClass(OutlinkDb.class);
        outlinkJob.setReducerClass(OutlinkDb.class);
        outlinkJob.setMapOutputKeyClass(Text.class);
        outlinkJob.setMapOutputValueClass(NutchWritable.class);
        outlinkJob.setOutputKeyClass(Text.class);
        outlinkJob.setOutputValueClass(LinkDatum.class);
        FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
        outlinkJob.setOutputFormat(MapFileOutputFormat.class);
        outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        runAndInstall("OutlinkDb", outlinkJob, fs, lock, outlinkDb, tempOutlinkDb);

        // ---- Stage 2: InlinkDb -------------------------------------------
        Path inlinkDb = new Path(path, INLINK_DIR);
        Path tempInlinkDb = tempSibling(inlinkDb);
        NutchJob inlinkJob = new NutchJob(conf);
        inlinkJob.setJobName("Inlinkdb " + inlinkDb);
        LOG.info("InlinkDb: adding input: " + outlinkDb);
        FileInputFormat.addInputPath(inlinkJob, outlinkDb);
        inlinkJob.setInputFormat(SequenceFileInputFormat.class);
        inlinkJob.setMapperClass(InlinkDb.class);
        inlinkJob.setMapOutputKeyClass(Text.class);
        inlinkJob.setMapOutputValueClass(LinkDatum.class);
        inlinkJob.setOutputKeyClass(Text.class);
        inlinkJob.setOutputValueClass(LinkDatum.class);
        FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
        inlinkJob.setOutputFormat(MapFileOutputFormat.class);
        inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        runAndInstall("InlinkDb", inlinkJob, fs, lock, inlinkDb, tempInlinkDb);

        // ---- Stage 3: NodeDb ---------------------------------------------
        Path nodeDb = new Path(path, NODE_DIR);
        Path tempNodeDb = tempSibling(nodeDb);
        NutchJob nodeJob = new NutchJob(conf);
        nodeJob.setJobName("NodeDb " + nodeDb);
        LOG.info("NodeDb: adding input: " + outlinkDb);
        LOG.info("NodeDb: adding input: " + inlinkDb);
        FileInputFormat.addInputPath(nodeJob, outlinkDb);
        FileInputFormat.addInputPath(nodeJob, inlinkDb);
        nodeJob.setInputFormat(SequenceFileInputFormat.class);
        nodeJob.setReducerClass(NodeDb.class);
        nodeJob.setMapOutputKeyClass(Text.class);
        nodeJob.setMapOutputValueClass(LinkDatum.class);
        nodeJob.setOutputKeyClass(Text.class);
        nodeJob.setOutputValueClass(Node.class);
        FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
        nodeJob.setOutputFormat(MapFileOutputFormat.class);
        nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        runAndInstall("NodeDb", nodeJob, fs, lock, nodeDb, tempNodeDb);

        // All three stages succeeded: release the lock and report.
        LockUtil.removeLockFile(fs, lock);
        long end = System.currentTimeMillis();
        LOG.info("WebGraphDb: finished at " + dateFormat.format(Long.valueOf(end)) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    /** Returns a sibling of {@code base} named {@code base + "-" + <random non-negative int>}, used as a job's temporary output. */
    private static Path tempSibling(Path base) {
        return new Path(base + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    }

    /**
     * Runs one stage's job and moves its temporary output over the target
     * directory. On an {@code IOException} the lock file is removed, the
     * temporary output is deleted if present, the error is logged once, and
     * the exception is rethrown.
     */
    private static void runAndInstall(String label, JobConf job, FileSystem fs, Path lock, Path target, Path temp) throws IOException {
        try {
            LOG.info(label + ": running");
            JobClient.runJob(job);
            LOG.info(label + ": installing " + target);
            FSUtils.replace(fs, target, temp, true);
            LOG.info(label + ": finished");
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            if (fs.exists(temp)) {
                fs.delete(temp, true);
            }
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
    }

    /**
     * Command-line entry point: runs this tool through {@code ToolRunner}
     * with a Nutch configuration and exits the JVM with the tool's return code.
     *
     * @param strArr command-line arguments, forwarded to {@link #run(String[])}
     * @throws Exception whatever {@code ToolRunner.run} propagates
     */
    public static void main(String[] strArr) throws Exception {
        Configuration nutchConf = NutchConfiguration.create();
        int exitCode = ToolRunner.run(nutchConf, new WebGraph(), strArr);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and starts the web graph update.
     * <p>
     * Recognised options: {@code -help}, {@code -webgraphdb <dir>},
     * {@code -segment <dir>...}, {@code -segmentDir <dir>},
     * {@code -normalize}, {@code -filter}. {@code -webgraphdb} and at least
     * one of {@code -segment} / {@code -segmentDir} are required. When both
     * segment options are present, the directories listed under
     * {@code -segmentDir} replace the ones given with {@code -segment}.
     *
     * @param strArr command-line arguments
     * @return 0 on success, -1 when usage help was printed, -2 when parsing
     *         or the update threw an exception (which is logged)
     */
    @Override
    public int run(String[] strArr) throws Exception {
        Options options = new Options();

        OptionBuilder.withArgName("help");
        OptionBuilder.withDescription("show this help message");
        Option helpOpt = OptionBuilder.create("help");

        OptionBuilder.withArgName("webgraphdb");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("the web graph database to use");
        Option webGraphDbOpt = OptionBuilder.create("webgraphdb");

        OptionBuilder.withArgName("segment");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("the segment(s) to use");
        Option segmentOpt = OptionBuilder.create("segment");

        OptionBuilder.withArgName("segmentDir");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("the segment directory to use");
        Option segmentDirOpt = OptionBuilder.create("segmentDir");

        OptionBuilder.withArgName("normalize");
        OptionBuilder.withDescription("whether to use URLNormalizers on the URL's in the segment");
        Option normalizeOpt = OptionBuilder.create("normalize");

        OptionBuilder.withArgName("filter");
        OptionBuilder.withDescription("whether to use URLFilters on the URL's in the segment");
        Option filterOpt = OptionBuilder.create("filter");

        options.addOption(helpOpt);
        options.addOption(webGraphDbOpt);
        options.addOption(segmentOpt);
        options.addOption(segmentDirOpt);
        options.addOption(normalizeOpt);
        options.addOption(filterOpt);

        try {
            CommandLine cmd = new GnuParser().parse(options, strArr);

            boolean hasSegment = cmd.hasOption("segment");
            boolean hasSegmentDir = cmd.hasOption("segmentDir");
            boolean usable = !cmd.hasOption("help") && cmd.hasOption("webgraphdb") && (hasSegment || hasSegmentDir);
            if (!usable) {
                new HelpFormatter().printHelp("WebGraph", options);
                return -1;
            }

            String webGraphDb = cmd.getOptionValue("webgraphdb");

            Path[] segments = null;
            if (hasSegment) {
                String[] segmentArgs = cmd.getOptionValues("segment");
                segments = new Path[segmentArgs.length];
                int idx = 0;
                for (String segmentArg : segmentArgs) {
                    segments[idx++] = new Path(segmentArg);
                }
            }
            if (hasSegmentDir) {
                // Takes precedence: overwrites anything collected from -segment.
                Path segmentDir = new Path(cmd.getOptionValue("segmentDir"));
                FileSystem dirFs = segmentDir.getFileSystem(getConf());
                segments = HadoopFSUtil.getPaths(dirFs.listStatus(segmentDir, HadoopFSUtil.getPassDirectoriesFilter(dirFs)));
            }

            boolean normalize = cmd.hasOption("normalize");
            boolean filter = cmd.hasOption("filter");
            createWebGraph(new Path(webGraphDb), segments, normalize, filter);
            return 0;
        } catch (Exception e) {
            LOG.error("WebGraph: " + StringUtils.stringifyException(e));
            return -2;
        }
    }
}
