package org.apache.nutch.crawl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.apache.tools.ant.taskdefs.optional.clearcase.CCRmtype;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.xml.XmlSuite;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator.class */
public class Generator extends Configured implements Tool {
    /** Config key: when {@code true}, {@code generate} runs an extra "updatedb" job after partitioning (default {@code false}). */
    public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
    /** Config key read by {@link Selector} as the minimum generator sort value (default {@code NaN}). */
    public static final String GENERATOR_MIN_SCORE = "generate.min.score";
    /** Config key read by {@link Selector} as the fetch-interval threshold; {@code -1} (default) disables the check. */
    public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval";
    /** Config key read by {@link Selector}: if set, only entries whose status name matches (case-insensitively) are emitted. */
    public static final String GENERATOR_RESTRICT_STATUS = "generate.restrict.status";
    /** Config key: whether {@link Selector} applies URL filters in its map phase (default {@code true}). */
    public static final String GENERATOR_FILTER = "generate.filter";
    /** Config key: whether {@link Selector} normalizes URLs before host/domain counting (default {@code true}). */
    public static final String GENERATOR_NORMALISE = "generate.normalise";
    /** Config key: per-host/per-domain URL count used by {@link Selector}; counting only happens when the value is positive (default {@code -1}). */
    public static final String GENERATOR_MAX_COUNT = "generate.max.count";
    /** Config key: {@link Selector} counts by domain when the value is {@code "domain"}, otherwise by host. */
    public static final String GENERATOR_COUNT_MODE = "generate.count.mode";
    /** Value of {@link #GENERATOR_COUNT_MODE} that selects per-domain counting. */
    public static final String GENERATOR_COUNT_VALUE_DOMAIN = "domain";
    /** Value of {@link #GENERATOR_COUNT_MODE} naming per-host counting; not referenced by the code in this file. */
    public static final String GENERATOR_COUNT_VALUE_HOST = "host";
    /** Config key: total number of URLs to select; {@link Selector} divides it by the number of reduce tasks. */
    public static final String GENERATOR_TOP_N = "generate.topN";
    /** Config key: the "current time" (ms) handed to the fetch schedule by {@link Selector}. */
    public static final String GENERATOR_CUR_TIME = "generate.curTime";
    /** Config key: delay in days (default 7) before an already-generated entry may be selected again; converted to ms by {@link Selector}. */
    public static final String GENERATOR_DELAY = "crawl.gen.delay";
    /** Config key: maximum number of segments (fetchlists) produced by one run (default 1). */
    public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments";
    /** Config key that is only checked in order to log that it is ignored. */
    public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
    /** Config key used by {@link Selector} as a fallback when {@link #GENERATOR_MAX_COUNT} is not set. */
    public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
    /** Logger shared by this class and its nested map/reduce classes. */
    public static final Logger LOG = LoggerFactory.getLogger(Generator.class);
    /** Timestamp pattern for segment directory names; only used inside the synchronized {@code generateSegmentName()}. */
    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator$CrawlDbUpdater.class */
    public static class CrawlDbUpdater extends MapReduceBase implements Mapper<Text, CrawlDatum, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> {
        /** Generate time of the current run, read from the job configuration. */
        long generateTime;
        /** Reused output record; overwritten for every key. */
        private CrawlDatum orig = new CrawlDatum();
        /** Reused holder for the generate time that will be stamped on the output (0 = none). */
        private LongWritable genTime = new LongWritable(0);

        /** Reads the generate time of this run ({@code Nutch.GENERATE_TIME_KEY}, default 0). */
        @Override
        public void configure(JobConf jobConf) {
            generateTime = jobConf.getLong(Nutch.GENERATE_TIME_KEY, 0L);
        }

        /** Identity map: forwards every record unchanged. */
        @Override
        public void map(Text text, CrawlDatum crawlDatum, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
            outputCollector.collect(text, crawlDatum);
        }

        /**
         * Collapses all records of one URL into a single output record.
         *
         * <p>Records without a generate-time stamp, or stamped with a time different from
         * this run's, become the output record. A record stamped with this run's time only
         * contributes its stamp, which is written into the output record's metadata.
         */
        @Override
        public void reduce(Text text, Iterator<CrawlDatum> it, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
            genTime.set(0L);
            while (it.hasNext()) {
                CrawlDatum candidate = it.next();
                if (!candidate.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
                    // Unstamped record: take it as the output record.
                    orig.set(candidate);
                    continue;
                }
                long stamped = ((LongWritable) candidate.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY)).get();
                if (stamped == generateTime) {
                    // Produced by this run: remember the stamp only.
                    genTime.set(stamped);
                } else {
                    // Stamped by some other run: keep the record, forget the stamp.
                    orig.set(candidate);
                    genTime.set(0L);
                }
            }
            if (genTime.get() != 0) {
                orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
            }
            outputCollector.collect(text, orig);
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator$DecreasingFloatComparator.class */
    public static class DecreasingFloatComparator extends FloatWritable.Comparator {
        /**
         * Compares two serialized floats in descending order by handing the operands to the
         * parent comparator in swapped positions.
         */
        @Override
        public int compare(byte[] bArr, int i, int i2, byte[] bArr2, int i3, int i4) {
            int swappedOrder = super.compare(bArr2, i3, i4, bArr, i, i2);
            return swappedOrder;
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator$GeneratorOutputFormat.class */
    public static class GeneratorOutputFormat extends MultipleSequenceFileOutputFormat<FloatWritable, SelectorEntry> {
        /**
         * Routes each entry into a per-segment sub-directory named
         * {@code fetchlist-<segnum>/<leaf name>}.
         */
        @Override
        public String generateFileNameForKeyValue(FloatWritable floatWritable, SelectorEntry selectorEntry, String str) {
            StringBuilder fileName = new StringBuilder("fetchlist-");
            fileName.append(selectorEntry.segnum.toString());
            fileName.append("/");
            fileName.append(str);
            return fileName.toString();
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator$HashComparator.class */
    public static class HashComparator extends WritableComparator {
        public HashComparator() {
            super(Text.class);
        }

        /** Orders two {@link Text} keys by the hash of their bytes rather than by content. */
        @Override
        public int compare(WritableComparable writableComparable, WritableComparable writableComparable2) {
            Text left = (Text) writableComparable;
            Text right = (Text) writableComparable2;
            return order(hash(left.getBytes(), 0, left.getLength()), hash(right.getBytes(), 0, right.getLength()));
        }

        /** Raw-bytes variant: orders two byte ranges by their hash. */
        @Override
        public int compare(byte[] bArr, int i, int i2, byte[] bArr2, int i3, int i4) {
            return order(hash(bArr, i, i2), hash(bArr2, i3, i4));
        }

        /** Three-way comparison of two hash values yielding -1, 0 or 1. */
        private static int order(int leftHash, int rightHash) {
            if (leftHash == rightHash) {
                return 0;
            }
            return leftHash < rightHash ? -1 : 1;
        }

        /** Polynomial (base 31) hash over {@code length} bytes starting at {@code offset}, walked from the last byte to the first. */
        private static int hash(byte[] bytes, int offset, int length) {
            int result = 1;
            int pos = length;
            while (--pos >= 0) {
                result = (31 * result) + bytes[offset + pos];
            }
            return result;
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator$PartitionReducer.class */
    public static class PartitionReducer extends MapReduceBase implements Reducer<Text, SelectorEntry, Text, CrawlDatum> {
        /** Unwraps every {@link SelectorEntry} and emits its URL together with its datum. */
        @Override
        public void reduce(Text text, Iterator<SelectorEntry> it, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
            for (; it.hasNext(); ) {
                SelectorEntry selected = it.next();
                Text url = selected.url;
                CrawlDatum datum = selected.datum;
                outputCollector.collect(url, datum);
            }
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator$Selector.class */
    public static class Selector implements Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>, Partitioner<FloatWritable, Writable>, Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> {
        /** Time (ms) against which fetch schedule and generate delay are evaluated. */
        private long curTime;
        /** Maximum number of URLs this reducer puts into one segment (topN / number of reducers). */
        private long limit;
        /** Number of URLs collected for the segment currently being filled. */
        private long count;
        /** URLs assigned so far, per segment (index = segment number - 1). */
        private int[] segCounts;
        /** Per-host/per-domain count bound; counting is only active when positive. */
        private int maxCount;
        private URLFilters filters;
        private URLNormalizers normalizers;
        private ScoringFilters scfilters;
        private boolean filter;
        private boolean normalise;
        /** Delay (ms) before an entry already stamped by a previous generate may be selected again. */
        private long genDelay;
        private FetchSchedule schedule;
        /** Generate time stamped on every selected entry. */
        private LongWritable genTime = new LongWritable(System.currentTimeMillis());
        /** Per host/domain: {current segment number, URLs counted in that segment}. */
        private HashMap<String, int[]> hostCounts = new HashMap<>();
        private boolean byDomain = false;
        private Partitioner<Text, Writable> partitioner = new URLPartitioner();
        /** Reused map output value. */
        private SelectorEntry entry = new SelectorEntry();
        /** Reused map output key. */
        private FloatWritable sortValue = new FloatWritable();
        private float scoreThreshold = 0.0f;
        private int intervalThreshold = -1;
        private String restrictStatus = null;
        private int maxNumSegments = 1;
        int currentsegmentnum = 1;

        /** Reads all generator settings from the job configuration. */
        @Override
        public void configure(JobConf jobConf) {
            this.curTime = jobConf.getLong(Generator.GENERATOR_CUR_TIME, System.currentTimeMillis());
            // "No topN" is represented by Long.MAX_VALUE, the same default that run() uses.
            this.limit = jobConf.getLong(Generator.GENERATOR_TOP_N, Long.MAX_VALUE) / jobConf.getNumReduceTasks();
            this.maxCount = jobConf.getInt(Generator.GENERATOR_MAX_COUNT, -1);
            int maxPerHost = jobConf.getInt(Generator.GENERATE_MAX_PER_HOST, -1);
            if (this.maxCount == -1 && maxPerHost != -1) {
                // Fall back to the per-host setting when the max count is not configured.
                this.maxCount = maxPerHost;
                this.byDomain = false;
            }
            if (Generator.GENERATOR_COUNT_VALUE_DOMAIN.equals(jobConf.get(Generator.GENERATOR_COUNT_MODE))) {
                this.byDomain = true;
            }
            this.filters = new URLFilters(jobConf);
            this.normalise = jobConf.getBoolean(Generator.GENERATOR_NORMALISE, true);
            if (this.normalise) {
                this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
            }
            this.scfilters = new ScoringFilters(jobConf);
            this.partitioner.configure(jobConf);
            this.filter = jobConf.getBoolean(Generator.GENERATOR_FILTER, true);
            // Configured in days, kept in milliseconds.
            this.genDelay = jobConf.getLong(Generator.GENERATOR_DELAY, 7L) * 3600 * 24 * 1000;
            long configuredGenTime = jobConf.getLong(Nutch.GENERATE_TIME_KEY, 0L);
            if (configuredGenTime > 0) {
                this.genTime.set(configuredGenTime);
            }
            this.schedule = FetchScheduleFactory.getFetchSchedule(jobConf);
            this.scoreThreshold = jobConf.getFloat(Generator.GENERATOR_MIN_SCORE, Float.NaN);
            this.intervalThreshold = jobConf.getInt(Generator.GENERATOR_MIN_INTERVAL, -1);
            this.restrictStatus = jobConf.get(Generator.GENERATOR_RESTRICT_STATUS, null);
            this.maxNumSegments = jobConf.getInt(Generator.GENERATOR_MAX_NUM_SEGMENTS, 1);
            this.segCounts = new int[this.maxNumSegments];
        }

        @Override
        public void close() {
        }

        /**
         * Emits the entry, keyed by its generator sort value, if it passes every selection
         * criterion: URL filters, fetch schedule, generate delay, status restriction, minimum
         * score and fetch-interval threshold. Selected entries are stamped with the generate time.
         */
        @Override
        public void map(Text text, CrawlDatum crawlDatum, OutputCollector<FloatWritable, SelectorEntry> outputCollector, Reporter reporter) throws IOException {
            if (this.filter) {
                try {
                    if (this.filters.filter(text.toString()) == null) {
                        return;
                    }
                } catch (URLFilterException e) {
                    // A failing filter does not reject the URL; it is only reported.
                    if (Generator.LOG.isWarnEnabled()) {
                        Generator.LOG.warn("Couldn't filter url: " + text + " (" + e.getMessage() + ")");
                    }
                }
            }
            if (!this.schedule.shouldFetch(text, crawlDatum, this.curTime)) {
                if (Generator.LOG.isDebugEnabled()) {
                    Generator.LOG.debug("-shouldFetch rejected '" + text + "', fetchTime=" + crawlDatum.getFetchTime() + ", curTime=" + this.curTime);
                }
                return;
            }
            LongWritable previousGenTime = (LongWritable) crawlDatum.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
            if (previousGenTime != null && previousGenTime.get() + this.genDelay > this.curTime) {
                // Already generated recently; wait until the delay has elapsed.
                return;
            }
            float sort = 1.0f;
            try {
                sort = this.scfilters.generatorSortValue(text, crawlDatum, 1.0f);
            } catch (ScoringFilterException e) {
                if (Generator.LOG.isWarnEnabled()) {
                    Generator.LOG.warn("Couldn't filter generatorSortValue for " + text + ": " + e);
                }
            }
            if (this.restrictStatus != null && !this.restrictStatus.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) {
                return;
            }
            // NaN means "no threshold". It must be tested with Float.isNaN because every
            // ==, <, >= comparison involving NaN evaluates to false.
            if (!Float.isNaN(this.scoreThreshold) && sort < this.scoreThreshold) {
                return;
            }
            if (this.intervalThreshold != -1 && crawlDatum.getFetchInterval() > this.intervalThreshold) {
                return;
            }
            this.sortValue.set(sort);
            crawlDatum.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, this.genTime);
            this.entry.datum = crawlDatum;
            this.entry.url = text;
            outputCollector.collect(this.sortValue, this.entry);
        }

        /** Partitions by the entry's URL (delegating to {@link URLPartitioner}), not by the score key. */
        @Override
        public int getPartition(FloatWritable floatWritable, Writable writable, int i) {
            return this.partitioner.getPartition(((SelectorEntry) writable).url, floatWritable, i);
        }

        /**
         * Assigns each incoming entry to a segment and emits it, stopping once every segment
         * has received {@code limit} entries. When host/domain counting is active, entries of
         * a host that exhausted its count are moved to the next segment or, if no segment is
         * left, dropped. Entries with an unparsable URL are dropped as well.
         */
        @Override
        public void reduce(FloatWritable floatWritable, Iterator<SelectorEntry> it, OutputCollector<FloatWritable, SelectorEntry> outputCollector, Reporter reporter) throws IOException {
            while (it.hasNext()) {
                if (this.count == this.limit) {
                    // Current segment is full: move on, or stop when no segment is left.
                    if (this.currentsegmentnum >= this.maxNumSegments) {
                        return;
                    }
                    this.count = 0L;
                    this.currentsegmentnum++;
                }
                SelectorEntry next = it.next();
                String urlString = next.url.toString();
                String hostOrDomain;
                try {
                    if (this.normalise && this.normalizers != null) {
                        urlString = this.normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
                    }
                    URL parsed = new URL(urlString);
                    hostOrDomain = (this.byDomain ? URLUtil.getDomainName(parsed) : parsed.getHost()).toLowerCase();
                } catch (Exception e) {
                    Generator.LOG.warn("Malformed URL: '" + urlString + "', skipping (" + StringUtils.stringifyException(e) + ")");
                    // Actually skip the entry, as the message says.
                    continue;
                }
                if (this.maxCount > 0) {
                    int[] hostCount = this.hostCounts.get(hostOrDomain);
                    if (hostCount == null) {
                        hostCount = new int[]{1, 0};
                        this.hostCounts.put(hostOrDomain, hostCount);
                    }
                    hostCount[1]++;
                    // Skip over segments that already reached the per-segment limit.
                    while (this.segCounts[hostCount[0] - 1] >= this.limit && hostCount[0] < this.maxNumSegments) {
                        hostCount[0]++;
                        hostCount[1] = 0;
                    }
                    // NOTE(review): ">=" rejects the maxCount-th URL of a host while the log
                    // condition below expects maxCount + 1 -- possible off-by-one; confirm intent.
                    if (hostCount[1] >= this.maxCount) {
                        if (hostCount[0] < this.maxNumSegments) {
                            hostCount[0]++;
                            hostCount[1] = 0;
                        } else {
                            if (hostCount[1] == this.maxCount + 1 && Generator.LOG.isInfoEnabled()) {
                                Generator.LOG.info("Host or domain " + hostOrDomain + " has more than " + this.maxCount + " URLs for all " + this.maxNumSegments + " segments. Additional URLs won't be included in the fetchlist.");
                            }
                            // No segment left for this host/domain: drop the entry.
                            continue;
                        }
                    }
                    next.segnum = new IntWritable(hostCount[0]);
                    this.segCounts[hostCount[0] - 1]++;
                } else {
                    next.segnum = new IntWritable(this.currentsegmentnum);
                    this.segCounts[this.currentsegmentnum - 1]++;
                }
                outputCollector.collect(floatWritable, next);
                // Only entries that were actually kept count towards the segment limit.
                this.count++;
            }
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator$SelectorEntry.class */
    public static class SelectorEntry implements Writable {
        /** URL of the selected entry. */
        public Text url = new Text();
        /** Crawl datum belonging to {@link #url}. */
        public CrawlDatum datum = new CrawlDatum();
        /** Number of the segment the entry was assigned to (0 until assigned). */
        public IntWritable segnum = new IntWritable(0);

        /** Deserializes the fields in the order url, datum, segnum. */
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            url.readFields(dataInput);
            datum.readFields(dataInput);
            segnum.readFields(dataInput);
        }

        /** Serializes the fields in the order url, datum, segnum. */
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            url.write(dataOutput);
            datum.write(dataOutput);
            segnum.write(dataOutput);
        }

        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append("url=").append(url.toString());
            text.append(", datum=").append(datum.toString());
            text.append(", segnum=").append(segnum.toString());
            return text.toString();
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Generator$SelectorInverseMapper.class */
    public static class SelectorInverseMapper extends MapReduceBase implements Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> {
        /** Re-keys a selected entry by its URL, discarding the score key. */
        @Override
        public void map(FloatWritable floatWritable, SelectorEntry selectorEntry, OutputCollector<Text, SelectorEntry> outputCollector, Reporter reporter) throws IOException {
            Text urlKey = selectorEntry.url;
            outputCollector.collect(urlKey, selectorEntry);
        }
    }

    /** Creates a generator without a configuration; one must be supplied via {@code setConf} before use. */
    public Generator() {
    }

    /**
     * Creates a generator that uses the given configuration.
     *
     * @param configuration configuration handed to {@code setConf}
     */
    public Generator(Configuration configuration) {
        setConf(configuration);
    }

    /**
     * Generates a single segment, taking the filter and normalise switches from the
     * configuration ({@code generate.filter} / {@code generate.normalise}, both default
     * {@code true}) and never forcing the crawldb lock.
     *
     * @param path crawldb directory
     * @param path2 directory in which segments are created
     * @param i number of fetch lists (reduce tasks) per segment, {@code -1} for the job default
     * @param j topN
     * @param j2 current time in milliseconds
     * @return the generated segment paths, or {@code null} if nothing was generated
     * @throws IOException if one of the underlying jobs fails
     */
    public Path[] generate(Path path, Path path2, int i, long j, long j2) throws IOException {
        NutchJob defaults = new NutchJob(getConf());
        boolean applyFilters = defaults.getBoolean(GENERATOR_FILTER, true);
        boolean applyNormalizers = defaults.getBoolean(GENERATOR_NORMALISE, true);
        return generate(path, path2, i, j, j2, applyFilters, applyNormalizers, false, 1);
    }

    /**
     * Compatibility overload: always normalises and generates exactly one segment.
     *
     * @param path crawldb directory
     * @param path2 directory in which segments are created
     * @param i number of fetch lists (reduce tasks) per segment, {@code -1} for the job default
     * @param j topN
     * @param j2 current time in milliseconds
     * @param z whether URL filters are applied
     * @param z2 passed on as the {@code force} argument of crawldb lock creation
     * @return the generated segment paths, or {@code null} if nothing was generated
     * @throws IOException if one of the underlying jobs fails
     */
    public Path[] generate(Path path, Path path2, int i, long j, long j2, boolean z, boolean z2) throws IOException {
        final boolean alwaysNormalise = true;
        final int singleSegment = 1;
        return generate(path, path2, i, j, j2, z, alwaysNormalise, z2, singleSegment);
    }

    /**
     * Selects the best-scoring URLs due for fetch and writes them into up to {@code i2}
     * new segments.
     *
     * <p>Steps: lock the crawldb, run the selection job into a temporary directory, partition
     * every produced {@code fetchlist-*} directory into a segment and, when
     * {@code generate.update.crawldb} is {@code true}, run an additional "updatedb" job and
     * install its output. The lock and the temporary directory are released on every exit path.
     *
     * @param path crawldb directory
     * @param path2 directory in which segments are created
     * @param i number of fetch lists (reduce tasks) per segment, {@code -1} for the job default
     * @param j topN; {@code Long.MAX_VALUE} means "not set"
     * @param j2 current time in milliseconds
     * @param z whether URL filters are applied
     * @param z2 whether URLs are normalised
     * @param z3 passed on as the {@code force} argument of crawldb lock creation
     * @param i2 maximum number of segments to generate
     * @return the generated segment paths, or {@code null} if no segment was generated or
     *         partitioning failed
     * @throws IOException if the selection job or the crawldb update fails
     */
    public Path[] generate(Path path, Path path2, int i, long j, long j2, boolean z, boolean z2, boolean z3, int i2) throws IOException {
        Path tempDir = new Path(getConf().get("mapred.temp.dir", Path.CUR_DIR) + "/generate-temp-" + System.currentTimeMillis());
        Path lock = new Path(path, ".locked");
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, z3);
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("Generator: starting at " + timestampFormat.format(new Date(start)));
        LOG.info("Generator: Selecting best-scoring urls due for fetch.");
        LOG.info("Generator: filtering: " + z);
        LOG.info("Generator: normalizing: " + z2);
        if (j != Long.MAX_VALUE) {
            LOG.info("Generator: topN: " + j);
        }
        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // --- selection job ---
        NutchJob selectJob = new NutchJob(getConf());
        selectJob.setJobName("generate: select from " + path);
        if (i == -1) {
            i = selectJob.getNumMapTasks();
        }
        if ("local".equals(selectJob.get("mapred.job.tracker")) && i != 1) {
            LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
            i = 1;
        }
        selectJob.setLong(GENERATOR_CUR_TIME, j2);
        long generateTime = System.currentTimeMillis();
        selectJob.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        selectJob.setLong(GENERATOR_TOP_N, j);
        selectJob.setBoolean(GENERATOR_FILTER, z);
        selectJob.setBoolean(GENERATOR_NORMALISE, z2);
        selectJob.setInt(GENERATOR_MAX_NUM_SEGMENTS, i2);
        FileInputFormat.addInputPath(selectJob, new Path(path, "current"));
        selectJob.setInputFormat(SequenceFileInputFormat.class);
        selectJob.setMapperClass(Selector.class);
        selectJob.setPartitionerClass(Selector.class);
        selectJob.setReducerClass(Selector.class);
        FileOutputFormat.setOutputPath(selectJob, tempDir);
        selectJob.setOutputKeyClass(FloatWritable.class);
        selectJob.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        selectJob.setOutputValueClass(SelectorEntry.class);
        selectJob.setOutputFormat(GeneratorOutputFormat.class);
        try {
            JobClient.runJob(selectJob);
        } catch (IOException e) {
            // Do not leave the crawldb locked or the temp output behind.
            releaseLockAndTemp(fs, lock, tempDir);
            throw e;
        }

        // --- turn every fetchlist-* directory of the temp output into a segment ---
        ArrayList<Path> generatedSegments = new ArrayList<>();
        try {
            for (FileStatus status : fs.listStatus(tempDir)) {
                Path subFetchList = status.getPath();
                if (subFetchList.getName().startsWith("fetchlist-")) {
                    generatedSegments.add(partitionSegment(fs, path2, subFetchList, i));
                }
            }
        } catch (Exception e) {
            LOG.warn("Generator: exception while partitioning segments, exiting ...", e);
            releaseLockAndTemp(fs, lock, tempDir);
            return null;
        }
        if (generatedSegments.isEmpty()) {
            LOG.warn("Generator: 0 records selected for fetching, exiting ...");
            releaseLockAndTemp(fs, lock, tempDir);
            return null;
        }

        // --- optional crawldb update ---
        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            Path updateTempDir = new Path(getConf().get("mapred.temp.dir", Path.CUR_DIR) + "/generate-temp-" + System.currentTimeMillis());
            NutchJob updateJob = new NutchJob(getConf());
            updateJob.setJobName("generate: updatedb " + path);
            updateJob.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segment : generatedSegments) {
                FileInputFormat.addInputPath(updateJob, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
            }
            FileInputFormat.addInputPath(updateJob, new Path(path, "current"));
            updateJob.setInputFormat(SequenceFileInputFormat.class);
            updateJob.setMapperClass(CrawlDbUpdater.class);
            updateJob.setReducerClass(CrawlDbUpdater.class);
            updateJob.setOutputFormat(MapFileOutputFormat.class);
            updateJob.setOutputKeyClass(Text.class);
            updateJob.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(updateJob, updateTempDir);
            try {
                JobClient.runJob(updateJob);
                CrawlDb.install(updateJob, path);
                fs.delete(updateTempDir, true);
            } catch (IOException e) {
                releaseLockAndTemp(fs, lock, tempDir);
                fs.delete(updateTempDir, true);
                // Propagates to the caller; no enclosing handler swallows it.
                throw e;
            }
        }

        releaseLockAndTemp(fs, lock, tempDir);
        long end = System.currentTimeMillis();
        LOG.info("Generator: finished at " + timestampFormat.format(new Date(end)) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
        return generatedSegments.toArray(new Path[generatedSegments.size()]);
    }

    /** Removes the crawldb lock file and deletes the temporary selection output. */
    private void releaseLockAndTemp(FileSystem fileSystem, Path lock, Path tempDir) throws IOException {
        LockUtil.removeLockFile(fileSystem, lock);
        fileSystem.delete(tempDir, true);
    }

    /**
     * Runs the job that turns one selected fetch list into a new segment, partitioned by
     * {@link URLPartitioner} into {@code i} fetch lists.
     *
     * @param fileSystem file system of the job (not used by this method itself)
     * @param path directory in which the segment is created
     * @param path2 input fetch list produced by the selection job
     * @param i number of reduce tasks, i.e. fetch lists in the segment
     * @return path of the newly created segment
     * @throws IOException if the partition job fails
     */
    private Path partitionSegment(FileSystem fileSystem, Path path, Path path2, int i) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("Generator: Partitioning selected urls for politeness.");
        }
        Path segment = new Path(path, generateSegmentName());
        Path generateDir = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
        LOG.info("Generator: segment: " + segment);

        NutchJob partitionJob = new NutchJob(getConf());
        partitionJob.setJobName("generate: partition " + segment);
        // Fresh random seed for the URL partitioner on every run.
        int partitionSeed = new Random().nextInt();
        partitionJob.setInt("partition.url.seed", partitionSeed);

        // Input side.
        FileInputFormat.addInputPath(partitionJob, path2);
        partitionJob.setInputFormat(SequenceFileInputFormat.class);

        // Map side: re-key by URL.
        partitionJob.setMapperClass(SelectorInverseMapper.class);
        partitionJob.setMapOutputKeyClass(Text.class);
        partitionJob.setMapOutputValueClass(SelectorEntry.class);

        // Shuffle and reduce side.
        partitionJob.setPartitionerClass(URLPartitioner.class);
        partitionJob.setOutputKeyComparatorClass(HashComparator.class);
        partitionJob.setReducerClass(PartitionReducer.class);
        partitionJob.setNumReduceTasks(i);

        // Output side.
        FileOutputFormat.setOutputPath(partitionJob, generateDir);
        partitionJob.setOutputFormat(SequenceFileOutputFormat.class);
        partitionJob.setOutputKeyClass(Text.class);
        partitionJob.setOutputValueClass(CrawlDatum.class);

        JobClient.runJob(partitionJob);
        return segment;
    }

    /**
     * Returns a segment name built from the current time ({@code yyyyMMddHHmmss}).
     *
     * <p>The method sleeps for one second first and is synchronized, so two calls that both
     * complete their sleep cannot yield the same second-resolution name. If the sleep is
     * interrupted, the name is still returned and the thread's interrupt status is restored
     * so that the caller can react to it.
     *
     * @return timestamp-based segment name
     */
    public static synchronized String generateSegmentName() {
        try {
            Thread.sleep(1000L);
        } catch (InterruptedException e) {
            // Do not swallow the interrupt: re-assert it for the caller.
            Thread.currentThread().interrupt();
        }
        return sdf.format(new Date(System.currentTimeMillis()));
    }

    /** Command-line entry point: runs the generator through {@link ToolRunner} and exits with its return code. */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(NutchConfiguration.create(), new Generator(), strArr);
        System.exit(exitCode);
    }

    @Override // org.apache.hadoop.util.Tool
    /**
     * Parses the command line and runs {@code generate}.
     *
     * <p>Expected arguments: {@code <crawldb> <segments_dir>} followed by any of
     * {@code -force}, {@code -topN N}, {@code -numFetchers N}, {@code -adddays N},
     * {@code -noFilter}, {@code -noNorm}, {@code -maxNumSegments N}. Unknown arguments
     * are ignored.
     *
     * @param strArr command-line arguments
     * @return {@code 0} on success; {@code -1} on bad usage, when nothing was generated,
     *         or when generation failed
     * @throws Exception if a numeric option value cannot be parsed
     */
    @Override
    public int run(String[] strArr) throws Exception {
        final String usage = "Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter] [-noNorm][-maxNumSegments num]";
        if (strArr.length < 2) {
            System.out.println(usage);
            return -1;
        }
        Path dbDir = new Path(strArr[0]);
        Path segmentsDir = new Path(strArr[1]);
        long curTime = System.currentTimeMillis();
        long topN = Long.MAX_VALUE;
        int numFetchers = -1;
        boolean filter = true;
        boolean normalise = true;
        boolean force = false;
        int maxNumSegments = 1;

        for (int idx = 2; idx < strArr.length; idx++) {
            String option = strArr[idx];
            boolean takesValue = "-topN".equals(option) || "-numFetchers".equals(option)
                    || "-adddays".equals(option) || "-maxNumSegments".equals(option);
            if (takesValue) {
                if (idx + 1 >= strArr.length) {
                    // Report the problem instead of failing with an index error.
                    System.out.println("Generator: missing value for option " + option);
                    System.out.println(usage);
                    return -1;
                }
                String value = strArr[++idx];
                if ("-topN".equals(option)) {
                    topN = Long.parseLong(value);
                } else if ("-numFetchers".equals(option)) {
                    numFetchers = Integer.parseInt(value);
                } else if ("-adddays".equals(option)) {
                    // Widen to long before multiplying: the int product overflows from 25 days on.
                    long numDays = Integer.parseInt(value);
                    curTime += numDays * 1000L * 60 * 60 * 24;
                } else {
                    maxNumSegments = Integer.parseInt(value);
                }
            } else if ("-noFilter".equals(option)) {
                filter = false;
            } else if ("-noNorm".equals(option)) {
                normalise = false;
            } else if ("-force".equals(option)) {
                force = true;
            }
        }

        try {
            Path[] segments = generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter, normalise, force, maxNumSegments);
            return segments == null ? -1 : 0;
        } catch (Exception e) {
            LOG.error("Generator: " + StringUtils.stringifyException(e));
            return -1;
        }
    }
}
