package org.apache.nutch.scoring.webgraph;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Random;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/ScoreUpdater.class */
/**
 * Map/reduce job and command-line tool that rewrites the score of every
 * {@link CrawlDatum} in a crawldb using the inlink score of the matching
 * {@link Node} from a webgraph node database.
 *
 * <p>The mapper wraps every input value (from either input directory) in an
 * {@link ObjectWritable} so that both value types can share one map output
 * value class. The reducer then pairs, per key, the datum with its node and
 * emits the datum with the updated score. Keys that have no datum are dropped;
 * datums that have no node receive the configurable "clear score"
 * ({@code link.score.updater.clear.score}, default {@code 0.0}).
 */
public class ScoreUpdater extends Configured implements Tool, Mapper<Text, Writable, Text, ObjectWritable>, Reducer<Text, ObjectWritable, Text, CrawlDatum> {
    public static final Logger LOG = LoggerFactory.getLogger(ScoreUpdater.class);

    /** Configuration key for the score given to datums that have no webgraph node. */
    private static final String CLEAR_SCORE_KEY = "link.score.updater.clear.score";

    /** Command-line option names. The crawldb option reuses the value of the normalizer scope constant. */
    private static final String HELP_OPTION = "help";
    private static final String CRAWLDB_OPTION = URLNormalizers.SCOPE_CRAWLDB;
    private static final String WEBGRAPHDB_OPTION = "webgraphdb";

    /** Job configuration handed to {@link #configure(JobConf)}; stored but not read within this class. */
    private JobConf conf;

    /** Score assigned to a datum when no node exists for its key. */
    private float clearScore = 0.0f;

    /**
     * Stores the job configuration and reads the clear score from it.
     *
     * @param jobConf the configuration of the running job
     */
    @Override
    public void configure(JobConf jobConf) {
        this.conf = jobConf;
        this.clearScore = jobConf.getFloat(CLEAR_SCORE_KEY, 0.0f);
    }

    /**
     * Wraps the input value in an {@link ObjectWritable} and emits it under the
     * unchanged key.
     *
     * @param key      the input key, passed through as-is
     * @param value    the input value to wrap
     * @param output   collector receiving the wrapped value
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    @Override
    public void map(Text key, Writable value, OutputCollector<Text, ObjectWritable> output, Reporter reporter) throws IOException {
        ObjectWritable wrapped = new ObjectWritable();
        wrapped.set(value);
        output.collect(key, wrapped);
    }

    /**
     * Combines the values of one key into a single re-scored {@link CrawlDatum}.
     *
     * <p>If several nodes or several datums arrive for the same key, the last one
     * seen of each kind is used. Values of any other type are ignored. Nothing is
     * emitted when the key has no datum.
     *
     * @param key      the key shared by all values
     * @param values   wrapped {@link Node} and/or {@link CrawlDatum} instances
     * @param output   collector receiving the updated datum
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    @Override
    public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
        String url = key.toString();
        Node node = null;
        CrawlDatum datum = null;
        while (values.hasNext()) {
            Object unwrapped = values.next().get();
            if (unwrapped instanceof Node) {
                node = (Node) unwrapped;
            } else if (unwrapped instanceof CrawlDatum) {
                datum = (CrawlDatum) unwrapped;
            }
        }

        if (datum == null) {
            // Parameterized logging: no message is built unless debug is enabled,
            // which matters because this runs once per key.
            LOG.debug("{}: no datum", url);
            return;
        }

        if (node != null) {
            float inlinkScore = node.getInlinkScore();
            datum.setScore(inlinkScore);
            LOG.debug("{}: setting to score {}", url, inlinkScore);
        } else {
            datum.setScore(this.clearScore);
            LOG.debug("{}: setting to clear score of {}", url, this.clearScore);
        }
        output.collect(key, datum);
    }

    /** No resources to release. */
    @Override
    public void close() {
    }

    /**
     * Runs the score-update job and installs its output as the new crawldb.
     *
     * <p>The job reads {@code <crawlDb>/current} and the webgraph node directory
     * and writes to a randomly named temporary directory under {@code crawlDb}.
     * On an {@link IOException} the failure is logged, the temporary directory is
     * removed on a best-effort basis, and the original exception is rethrown.
     *
     * @param crawlDb    the crawldb whose scores are to be updated
     * @param webGraphDb the webgraph database containing the node directory
     * @throws IOException if the job or the installation of its output fails
     */
    public void update(Path crawlDb, Path webGraphDb) throws IOException {
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ScoreUpdater: starting at {}", timestampFormat.format(Long.valueOf(start)));

        Configuration configuration = getConf();
        FileSystem fs = FileSystem.get(configuration);

        LOG.info("Running crawldb update {}", crawlDb);
        Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
        Path currentCrawlDb = new Path(crawlDb, "current");
        Path tempCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        NutchJob job = new NutchJob(configuration);
        job.setJobName("Update CrawlDb from WebGraph");
        FileInputFormat.addInputPath(job, currentCrawlDb);
        FileInputFormat.addInputPath(job, nodeDb);
        FileOutputFormat.setOutputPath(job, tempCrawlDb);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(ScoreUpdater.class);
        job.setReducerClass(ScoreUpdater.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObjectWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        job.setOutputFormat(MapFileOutputFormat.class);

        try {
            JobClient.runJob(job);
            LOG.info("ScoreUpdater: installing new crawldb {}", crawlDb);
            // NOTE(review): CrawlDb.install presumably moves the job output into
            // place as the current crawldb; its exact steps are not visible here.
            CrawlDb.install(job, crawlDb);
            long end = System.currentTimeMillis();
            LOG.info("ScoreUpdater: finished at {}, elapsed: {}",
                    timestampFormat.format(Long.valueOf(end)), TimingUtil.elapsedTime(start, end));
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            removeQuietly(fs, tempCrawlDb);
            throw e;
        }
    }

    /**
     * Deletes {@code dir} recursively if it exists. A failure is logged rather
     * than thrown, so that it cannot replace the exception that triggered the
     * cleanup in the first place.
     */
    private static void removeQuietly(FileSystem fs, Path dir) {
        try {
            if (fs.exists(dir)) {
                fs.delete(dir, true);
            }
        } catch (IOException cleanupFailure) {
            LOG.warn("ScoreUpdater: could not remove temporary crawldb {}: {}",
                    dir, StringUtils.stringifyException(cleanupFailure));
        }
    }

    /**
     * Command-line entry point; exits the JVM with the status returned by
     * {@link #run(String[])}.
     *
     * @param args command-line arguments
     * @throws Exception if {@link ToolRunner} fails
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(NutchConfiguration.create(), new ScoreUpdater(), args));
    }

    /**
     * Parses the command line and runs {@link #update(Path, Path)}.
     *
     * <p>Both the crawldb and the {@code webgraphdb} options are required. Usage
     * help is printed when either is missing or when {@code help} is given.
     *
     * @param args command-line arguments
     * @return {@code 0} on success; {@code -1} if help was printed or any
     *         exception occurred (the exception is logged, not propagated)
     */
    @Override
    public int run(String[] args) throws Exception {
        Option helpOption = OptionBuilder.withArgName(HELP_OPTION)
                .withDescription("show this help message")
                .create(HELP_OPTION);
        Option crawlDbOption = OptionBuilder.withArgName(CRAWLDB_OPTION)
                .hasArg()
                .withDescription("the crawldb to use")
                .create(CRAWLDB_OPTION);
        Option webGraphDbOption = OptionBuilder.withArgName(WEBGRAPHDB_OPTION)
                .hasArg()
                .withDescription("the webgraphdb to use")
                .create(WEBGRAPHDB_OPTION);

        Options options = new Options();
        options.addOption(helpOption);
        options.addOption(crawlDbOption);
        options.addOption(webGraphDbOption);

        try {
            CommandLine line = new GnuParser().parse(options, args);
            boolean haveRequired = line.hasOption(WEBGRAPHDB_OPTION) && line.hasOption(CRAWLDB_OPTION);
            if (line.hasOption(HELP_OPTION) || !haveRequired) {
                new HelpFormatter().printHelp("ScoreUpdater", options);
                return -1;
            }
            Path crawlDb = new Path(line.getOptionValue(CRAWLDB_OPTION));
            Path webGraphDb = new Path(line.getOptionValue(WEBGRAPHDB_OPTION));
            update(crawlDb, webGraphDb);
            return 0;
        } catch (Exception e) {
            // Tool boundary: report the failure through the exit status.
            LOG.error("ScoreUpdater: " + StringUtils.stringifyException(e));
            return -1;
        }
    }
}
