package org.apache.nutch.fetcher;

import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.jempbox.xmp.ResourceEvent;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import shaded.org.sonatype.plexus.components.sec.dispatcher.SecUtil;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/fetcher/Fetcher.class */
public class Fetcher extends Configured implements Tool, MapRunnable<Text, CrawlDatum, Text, NutchWritable> {
    public static final int PERM_REFRESH_TIME = 5;
    public static final String CONTENT_REDIR = "content";
    public static final String PROTOCOL_REDIR = "protocol";
    public static final Logger LOG = LoggerFactory.getLogger(Fetcher.class);
    private OutputCollector<Text, NutchWritable> output;
    private Reporter reporter;
    private String segmentName;
    private AtomicInteger activeThreads;
    private AtomicInteger spinWaiting;
    private long start;
    private AtomicLong lastRequestStart;
    private AtomicLong bytes;
    private AtomicInteger pages;
    private AtomicInteger errors;
    private boolean storingContent;
    private boolean parsing;
    FetchItemQueues fetchQueues;
    QueueFeeder feeder;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/fetcher/Fetcher$FetchItem.class */
    /**
     * One URL scheduled for fetching, bundled with its {@link CrawlDatum} and
     * the id of the fetch queue it is assigned to.
     *
     * <p>Instances are normally built through {@link #create(Text, CrawlDatum, String, int)},
     * which derives the queue id from the URL and the queue mode.
     */
    public static class FetchItem {
        /** Outlink depth supplied at construction; 0 for the overloads that take no depth. */
        int outlinkDepth;
        /** Id of the queue this item belongs to. */
        String queueID;
        /** The URL in its {@link Text} form. */
        Text url;
        /** The URL in its {@link URL} form. */
        URL u;
        /** Crawl datum associated with the URL. */
        CrawlDatum datum;

        /**
         * Creates an item with an outlink depth of 0.
         *
         * @param url     URL as text
         * @param u       URL as a {@link URL}
         * @param datum   crawl datum for the URL
         * @param queueID id of the queue the item is assigned to
         */
        public FetchItem(Text url, URL u, CrawlDatum datum, String queueID) {
            this(url, u, datum, queueID, 0);
        }

        /**
         * Creates an item.
         *
         * @param url          URL as text
         * @param u            URL as a {@link URL}
         * @param datum        crawl datum for the URL
         * @param queueID      id of the queue the item is assigned to
         * @param outlinkDepth outlink depth recorded on the item
         */
        public FetchItem(Text url, URL u, CrawlDatum datum, String queueID, int outlinkDepth) {
            this.url = url;
            this.u = u;
            this.datum = datum;
            this.queueID = queueID;
            this.outlinkDepth = outlinkDepth;
        }

        /**
         * Same as {@link #create(Text, CrawlDatum, String, int)} with an outlink depth of 0.
         *
         * @param url       URL to fetch
         * @param datum     crawl datum for the URL
         * @param queueMode queue mode used to derive the queue id
         * @return the new item, or {@code null} if it could not be created
         */
        public static FetchItem create(Text url, CrawlDatum datum, String queueMode) {
            return create(url, datum, queueMode, 0);
        }

        /**
         * Parses {@code url} and builds an item whose queue id is
         * {@code <protocol>://<key>}, both parts lower-cased. The key depends on
         * {@code queueMode} (compared case-insensitively):
         * <ul>
         *   <li>{@code "byIP"}: the host's resolved IP address;</li>
         *   <li>{@code "byDomain"}: the result of {@code URLUtil.getDomainName};</li>
         *   <li>anything else: the URL's host.</li>
         * </ul>
         * For the last two, the full URL string is used as key when the lookup
         * yields {@code null}.
         *
         * @param url          URL to fetch
         * @param datum        crawl datum for the URL
         * @param queueMode    queue mode used to derive the queue id
         * @param outlinkDepth outlink depth recorded on the item
         * @return the new item, or {@code null} when the host cannot be resolved
         *         in {@code "byIP"} mode or when any exception is raised while
         *         building the item (the failure is logged)
         */
        public static FetchItem create(Text url, CrawlDatum datum, String queueMode, int outlinkDepth) {
            try {
                URL u = new URL(url.toString());
                // Locale.ROOT keeps queue ids independent of the JVM default locale
                // (fully qualified so no additional import is required).
                String proto = u.getProtocol().toLowerCase(java.util.Locale.ROOT);
                String key;
                if ("byIP".equalsIgnoreCase(queueMode)) {
                    try {
                        key = InetAddress.getByName(u.getHost()).getHostAddress();
                    } catch (UnknownHostException e) {
                        // Unresolvable host: no fallback to the host name, the URL is skipped.
                        Fetcher.LOG.warn("Unable to resolve: " + u.getHost() + ", skipping.");
                        return null;
                    }
                } else if ("byDomain".equalsIgnoreCase(queueMode)) {
                    key = URLUtil.getDomainName(u);
                    if (key == null) {
                        Fetcher.LOG.warn("Unknown domain for url: " + url + ", using URL string as key");
                        key = u.toExternalForm();
                    }
                } else {
                    key = u.getHost();
                    if (key == null) {
                        Fetcher.LOG.warn("Unknown host for url: " + url + ", using URL string as key");
                        key = u.toExternalForm();
                    }
                }
                String queueID = proto + "://" + key.toLowerCase(java.util.Locale.ROOT);
                return new FetchItem(url, u, datum, queueID, outlinkDepth);
            } catch (Exception e) {
                Fetcher.LOG.warn("Cannot parse url: " + url, e);
                return null;
            }
        }

        /** @return the crawl datum associated with the URL */
        public CrawlDatum getDatum() {
            return this.datum;
        }

        /** @return the id of the queue this item belongs to */
        public String getQueueID() {
            return this.queueID;
        }

        /** @return the URL in its {@link Text} form */
        public Text getUrl() {
            return this.url;
        }

        /** @return the URL in its {@link URL} form */
        public URL getURL2() {
            return this.u;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/fetcher/Fetcher$FetchItemQueue.class */
    public static class FetchItemQueue {
        List<FetchItem> queue = Collections.synchronizedList(new LinkedList());
        Set<FetchItem> inProgress = Collections.synchronizedSet(new HashSet());
        AtomicLong nextFetchTime = new AtomicLong();
        AtomicInteger exceptionCounter = new AtomicInteger();
        long crawlDelay;
        long minCrawlDelay;
        int maxThreads;
        Configuration conf;

        public FetchItemQueue(Configuration configuration, int i, long j, long j2) {
            this.conf = configuration;
            this.maxThreads = i;
            this.crawlDelay = j;
            this.minCrawlDelay = j2;
            setEndTime(System.currentTimeMillis() - j);
        }

        public synchronized int emptyQueue() {
            int size = this.queue.size();
            this.queue.clear();
            return size;
        }

        public int getQueueSize() {
            return this.queue.size();
        }

        public int getInProgressSize() {
            return this.inProgress.size();
        }

        public int incrementExceptionCounter() {
            return this.exceptionCounter.incrementAndGet();
        }

        public void finishFetchItem(FetchItem fetchItem, boolean z) {
            if (fetchItem != null) {
                this.inProgress.remove(fetchItem);
                setEndTime(System.currentTimeMillis(), z);
            }
        }

        public void addFetchItem(FetchItem fetchItem) {
            if (fetchItem == null) {
                return;
            }
            this.queue.add(fetchItem);
        }

        public void addInProgressFetchItem(FetchItem fetchItem) {
            if (fetchItem == null) {
                return;
            }
            this.inProgress.add(fetchItem);
        }

        public FetchItem getFetchItem() {
            if (this.inProgress.size() >= this.maxThreads) {
                return null;
            }
            if (this.nextFetchTime.get() > System.currentTimeMillis()) {
                return null;
            }
            FetchItem fetchItem = null;
            if (this.queue.size() == 0) {
                return null;
            }
            try {
                fetchItem = this.queue.remove(0);
                this.inProgress.add(fetchItem);
            } catch (Exception e) {
                Fetcher.LOG.error("Cannot remove FetchItem from queue or cannot add it to inProgress queue", (Throwable) e);
            }
            return fetchItem;
        }

        public synchronized void dump() {
            Fetcher.LOG.info("  maxThreads    = " + this.maxThreads);
            Fetcher.LOG.info("  inProgress    = " + this.inProgress.size());
            Fetcher.LOG.info("  crawlDelay    = " + this.crawlDelay);
            Fetcher.LOG.info("  minCrawlDelay = " + this.minCrawlDelay);
            Fetcher.LOG.info("  nextFetchTime = " + this.nextFetchTime.get());
            Fetcher.LOG.info("  now           = " + System.currentTimeMillis());
            for (int i = 0; i < this.queue.size(); i++) {
                Fetcher.LOG.info("  " + i + ". " + this.queue.get(i).url);
            }
        }

        private void setEndTime(long j) {
            setEndTime(j, false);
        }

        private void setEndTime(long j, boolean z) {
            if (z) {
                this.nextFetchTime.set(j);
            } else {
                this.nextFetchTime.set(j + (this.maxThreads > 1 ? this.minCrawlDelay : this.crawlDelay));
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/fetcher/Fetcher$FetchItemQueues.class */
    /**
     * Registry of {@link FetchItemQueue}s keyed by queue id, plus a running
     * count of the items pending across all of them.
     *
     * <p>{@code queues} is a plain {@link HashMap}; every method that touches it
     * is {@code synchronized} on this object.
     */
    public static class FetchItemQueues {
        public static final String DEFAULT_ID = "default";
        /** Queues by queue id. Guarded by this object's monitor. */
        Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
        /** Number of pending items across all queues. */
        AtomicInteger totalSize = new AtomicInteger(0);
        /** Value of {@code fetcher.threads.per.queue}. */
        int maxThreads;
        /** {@code fetcher.server.delay} converted to milliseconds. */
        long crawlDelay;
        /** {@code fetcher.server.min.delay} converted to milliseconds. */
        long minCrawlDelay;
        /** Time in milliseconds after which pending items are dropped; -1 disables the check. */
        long timelimit = -1L;
        /** Exception count at which a queue is emptied; -1 disables the check. */
        int maxExceptionsPerQueue = -1;
        Configuration conf;
        public static final String QUEUE_MODE_HOST = "byHost";
        public static final String QUEUE_MODE_DOMAIN = "byDomain";
        public static final String QUEUE_MODE_IP = "byIP";
        /** One of the {@code QUEUE_MODE_*} values. */
        String queueMode;

        /**
         * Reads the queue settings from {@code conf}. An unrecognised
         * {@code fetcher.queue.mode} is logged and replaced by {@code byHost}.
         *
         * @param conf configuration to read from
         */
        public FetchItemQueues(Configuration conf) {
            this.conf = conf;
            this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
            this.queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
            boolean knownMode = this.queueMode.equals(QUEUE_MODE_IP)
                    || this.queueMode.equals(QUEUE_MODE_DOMAIN)
                    || this.queueMode.equals(QUEUE_MODE_HOST);
            if (!knownMode) {
                Fetcher.LOG.error("Unknown partition mode : " + this.queueMode + " - forcing to byHost");
                this.queueMode = QUEUE_MODE_HOST;
            }
            Fetcher.LOG.info("Using queue mode : " + this.queueMode);
            // Delays are configured in (fractional) seconds; a float product cannot be
            // assigned to a long without an explicit narrowing cast.
            this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000.0f);
            this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000.0f);
            this.timelimit = conf.getLong("fetcher.timelimit", -1L);
            this.maxExceptionsPerQueue = conf.getInt("fetcher.max.exceptions.per.queue", -1);
        }

        /** @return the number of pending items across all queues */
        public int getTotalSize() {
            return this.totalSize.get();
        }

        /** @return the number of queues currently registered */
        public synchronized int getQueueCount() {
            return this.queues.size();
        }

        /**
         * Builds a {@link FetchItem} for {@code url} using the configured queue
         * mode and enqueues it; nothing is added when the item cannot be created.
         *
         * @param url   URL to fetch
         * @param datum crawl datum for the URL
         */
        public void addFetchItem(Text url, CrawlDatum datum) {
            FetchItem item = FetchItem.create(url, datum, this.queueMode);
            if (item != null) {
                addFetchItem(item);
            }
        }

        /**
         * Appends {@code fetchItem} to the queue named by its queue id, creating
         * the queue if needed, and increments the total size.
         *
         * @param fetchItem item to enqueue
         */
        public synchronized void addFetchItem(FetchItem fetchItem) {
            getFetchItemQueue(fetchItem.queueID).addFetchItem(fetchItem);
            this.totalSize.incrementAndGet();
        }

        /** Equivalent to {@code finishFetchItem(fetchItem, false)}. */
        public void finishFetchItem(FetchItem fetchItem) {
            finishFetchItem(fetchItem, false);
        }

        /**
         * Reports {@code fetchItem} as finished to its queue. A warning is logged
         * when no queue is registered under the item's queue id.
         *
         * <p>Synchronized because it reads {@code queues}, which other threads
         * modify under the same lock (e.g. {@link #getFetchItem()} removes idle
         * queues while iterating).
         *
         * @param fetchItem item that finished
         * @param asap      forwarded to {@link FetchItemQueue#finishFetchItem(FetchItem, boolean)}
         */
        public synchronized void finishFetchItem(FetchItem fetchItem, boolean asap) {
            FetchItemQueue fiq = this.queues.get(fetchItem.queueID);
            if (fiq == null) {
                Fetcher.LOG.warn("Attempting to finish item from unknown queue: " + fetchItem);
            } else {
                fiq.finishFetchItem(fetchItem, asap);
            }
        }

        /**
         * Returns the queue registered under {@code id}, creating and registering
         * a new one with the configured limits if none exists.
         *
         * @param id queue id
         * @return the existing or newly created queue, never {@code null}
         */
        public synchronized FetchItemQueue getFetchItemQueue(String id) {
            FetchItemQueue fiq = this.queues.get(id);
            if (fiq == null) {
                fiq = new FetchItemQueue(this.conf, this.maxThreads, this.crawlDelay, this.minCrawlDelay);
                this.queues.put(id, fiq);
            }
            return fiq;
        }

        /**
         * Scans the queues for the first one willing to hand out an item.
         * Queues with neither pending nor in-progress items are unregistered
         * along the way.
         *
         * @return the next item to fetch, or {@code null} if no queue has one ready
         */
        public synchronized FetchItem getFetchItem() {
            Iterator<Map.Entry<String, FetchItemQueue>> it = this.queues.entrySet().iterator();
            while (it.hasNext()) {
                FetchItemQueue fiq = it.next().getValue();
                if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) {
                    it.remove();
                    continue;
                }
                FetchItem item = fiq.getFetchItem();
                if (item != null) {
                    this.totalSize.decrementAndGet();
                    return item;
                }
            }
            return null;
        }

        /**
         * Empties all queues once the time limit has been reached.
         *
         * @return the number of items dropped; 0 when the limit is disabled or
         *         not yet reached
         */
        public synchronized int checkTimelimit() {
            int dropped = 0;
            if (this.timelimit != -1 && System.currentTimeMillis() >= this.timelimit) {
                dropped = emptyQueues();
                // With no queue left, a non-zero total can only be stale.
                if (this.totalSize.get() != 0 && this.queues.size() == 0) {
                    this.totalSize.set(0);
                }
            }
            return dropped;
        }

        /**
         * Drops the pending items of every non-empty queue and reduces the total
         * size accordingly.
         *
         * @return the number of items dropped
         */
        public synchronized int emptyQueues() {
            int dropped = 0;
            for (Map.Entry<String, FetchItemQueue> entry : this.queues.entrySet()) {
                FetchItemQueue fiq = entry.getValue();
                if (fiq.getQueueSize() == 0) {
                    continue;
                }
                Fetcher.LOG.info("* queue: " + entry.getKey() + " >> dropping! ");
                int removed = fiq.emptyQueue();
                this.totalSize.addAndGet(-removed);
                dropped += removed;
            }
            return dropped;
        }

        /**
         * Records one exception against the queue {@code queueid} and empties the
         * queue once its exception count reaches {@code maxExceptionsPerQueue}.
         * Unknown or empty queues are left untouched and not counted.
         *
         * @param queueid id of the queue the exception belongs to
         * @return the number of items removed, 0 if the queue was not emptied
         */
        public synchronized int checkExceptionThreshold(String queueid) {
            FetchItemQueue fiq = this.queues.get(queueid);
            if (fiq == null || fiq.getQueueSize() == 0) {
                return 0;
            }
            int excCount = fiq.incrementExceptionCounter();
            if (this.maxExceptionsPerQueue == -1 || excCount < this.maxExceptionsPerQueue) {
                return 0;
            }
            int removed = fiq.emptyQueue();
            Fetcher.LOG.info("* queue: " + queueid + " >> removed " + removed + " URLs from queue because " + excCount + " exceptions occurred");
            this.totalSize.addAndGet(-removed);
            return removed;
        }

        /** Logs the content of every non-empty queue at INFO level. */
        public synchronized void dump() {
            for (Map.Entry<String, FetchItemQueue> entry : this.queues.entrySet()) {
                FetchItemQueue fiq = entry.getValue();
                if (fiq.getQueueSize() != 0) {
                    Fetcher.LOG.info("* queue: " + entry.getKey());
                    fiq.dump();
                }
            }
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/fetcher/Fetcher$FetcherThread.class */
    private class FetcherThread extends Thread {
        private Configuration conf;
        private URLFilters urlFilters;
        private ScoringFilters scfilters;
        private ParseUtil parseUtil;
        private URLNormalizers normalizers;
        private ProtocolFactory protocolFactory;
        private long maxCrawlDelay;
        private String queueMode;
        private int maxRedirect;
        private String reprUrl;
        private boolean redirecting;
        private int redirectCount;
        private boolean ignoreExternalLinks;
        private int maxOutlinksPerPage;
        private final int maxOutlinks;
        private final int interval;
        private int maxOutlinkDepth;
        private int maxOutlinkDepthNumLinks;
        private int outlinksDepthDivisor;
        private boolean skipTruncated;

        public FetcherThread(Configuration configuration) {
            setDaemon(true);
            setName("FetcherThread");
            this.conf = configuration;
            this.urlFilters = new URLFilters(configuration);
            this.scfilters = new ScoringFilters(configuration);
            this.parseUtil = new ParseUtil(configuration);
            this.skipTruncated = configuration.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
            this.protocolFactory = new ProtocolFactory(configuration);
            this.normalizers = new URLNormalizers(configuration, URLNormalizers.SCOPE_FETCHER);
            this.maxCrawlDelay = configuration.getInt("fetcher.max.crawl.delay", 30) * 1000;
            this.queueMode = configuration.get("fetcher.queue.mode", "byHost");
            if (!this.queueMode.equals("byIP") && !this.queueMode.equals("byDomain") && !this.queueMode.equals("byHost")) {
                Fetcher.LOG.error("Unknown partition mode : " + this.queueMode + " - forcing to byHost");
                this.queueMode = "byHost";
            }
            Fetcher.LOG.info("Using queue mode : " + this.queueMode);
            this.maxRedirect = configuration.getInt("http.redirect.max", 3);
            this.ignoreExternalLinks = configuration.getBoolean("db.ignore.external.links", false);
            this.maxOutlinksPerPage = configuration.getInt("db.max.outlinks.per.page", 100);
            this.maxOutlinks = this.maxOutlinksPerPage < 0 ? Integer.MAX_VALUE : this.maxOutlinksPerPage;
            this.interval = configuration.getInt("db.fetch.interval.default", 2592000);
            this.ignoreExternalLinks = configuration.getBoolean("db.ignore.external.links", false);
            this.maxOutlinkDepth = configuration.getInt("fetcher.follow.outlinks.depth", -1);
            this.maxOutlinkDepthNumLinks = configuration.getInt("fetcher.follow.outlinks.num.links", 4);
            this.outlinksDepthDivisor = configuration.getInt("fetcher.follow.outlinks.depth.divisor", 2);
        }

        @Override // java.lang.Thread, java.lang.Runnable
        public void run() {
            int i;
            boolean z;
            Fetcher.this.activeThreads.incrementAndGet();
            FetchItem fetchItem = null;
            while (true) {
                try {
                    try {
                        fetchItem = Fetcher.this.fetchQueues.getFetchItem();
                        if (fetchItem != null) {
                            Fetcher.this.lastRequestStart.set(System.currentTimeMillis());
                            Text text = (Text) fetchItem.datum.getMetaData().get((Object) Nutch.WRITABLE_REPR_URL_KEY);
                            if (text == null) {
                                this.reprUrl = fetchItem.url.toString();
                            } else {
                                this.reprUrl = text.toString();
                            }
                            try {
                                this.redirecting = false;
                                this.redirectCount = 0;
                            } catch (Throwable th) {
                                Fetcher.this.fetchQueues.finishFetchItem(fetchItem);
                                logError(fetchItem.url, StringUtils.stringifyException(th));
                                output(fetchItem.url, fetchItem.datum, null, ProtocolStatus.STATUS_FAILED, 34);
                            }
                            do {
                                if (Fetcher.LOG.isInfoEnabled()) {
                                    Fetcher.LOG.info("fetching " + fetchItem.url);
                                }
                                if (Fetcher.LOG.isDebugEnabled()) {
                                    Fetcher.LOG.debug("redirectCount=" + this.redirectCount);
                                }
                                this.redirecting = false;
                                Protocol protocol = this.protocolFactory.getProtocol(fetchItem.url.toString());
                                RobotRules robotRules = protocol.getRobotRules(fetchItem.url, fetchItem.datum);
                                if (robotRules.isAllowed(fetchItem.u)) {
                                    if (robotRules.getCrawlDelay() > 0) {
                                        if (robotRules.getCrawlDelay() > this.maxCrawlDelay) {
                                            Fetcher.this.fetchQueues.finishFetchItem(fetchItem, true);
                                            Fetcher.LOG.debug("Crawl-Delay for " + fetchItem.url + " too long (" + robotRules.getCrawlDelay() + "), skipping");
                                            output(fetchItem.url, fetchItem.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, 37);
                                            Fetcher.this.reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1L);
                                        } else {
                                            Fetcher.this.fetchQueues.getFetchItemQueue(fetchItem.queueID).crawlDelay = robotRules.getCrawlDelay();
                                        }
                                    }
                                    ProtocolOutput protocolOutput = protocol.getProtocolOutput(fetchItem.url, fetchItem.datum);
                                    ProtocolStatus status = protocolOutput.getStatus();
                                    Content content = protocolOutput.getContent();
                                    Fetcher.this.fetchQueues.finishFetchItem(fetchItem);
                                    String text2 = fetchItem.url.toString();
                                    Fetcher.this.reporter.incrCounter("FetcherStatus", status.getName(), 1L);
                                    switch (status.getCode()) {
                                        case 1:
                                            ParseStatus output = output(fetchItem.url, fetchItem.datum, content, status, 33, fetchItem.outlinkDepth);
                                            Fetcher.this.updateStatus(content.getContent().length);
                                            if (output != null && output.isSuccess() && output.getMinorCode() == 100) {
                                                Text handleRedirect = handleRedirect(fetchItem.url, fetchItem.datum, text2, output.getMessage(), Integer.valueOf(output.getArgs()[1]).intValue() < 5, "content");
                                                if (handleRedirect != null) {
                                                    CrawlDatum crawlDatum = new CrawlDatum(1, fetchItem.datum.getFetchInterval(), fetchItem.datum.getScore());
                                                    crawlDatum.getMetaData().putAll(fetchItem.datum.getMetaData());
                                                    this.scfilters.initialScore(handleRedirect, crawlDatum);
                                                    if (this.reprUrl != null) {
                                                        crawlDatum.getMetaData().put((Writable) Nutch.WRITABLE_REPR_URL_KEY, (Writable) new Text(this.reprUrl));
                                                    }
                                                    fetchItem = FetchItem.create(handleRedirect, crawlDatum, this.queueMode);
                                                    if (fetchItem != null) {
                                                        Fetcher.this.fetchQueues.getFetchItemQueue(fetchItem.queueID).addInProgressFetchItem(fetchItem);
                                                    } else {
                                                        this.redirecting = false;
                                                        Fetcher.this.reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1L);
                                                    }
                                                }
                                                break;
                                            }
                                            break;
                                        case 2:
                                        case 3:
                                        case 4:
                                        case 5:
                                        case 6:
                                        case 7:
                                        case 8:
                                        case 9:
                                        case 10:
                                        case 19:
                                        case 20:
                                        default:
                                            if (Fetcher.LOG.isWarnEnabled()) {
                                                Fetcher.LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                                            }
                                            output(fetchItem.url, fetchItem.datum, null, status, 34);
                                            break;
                                        case 11:
                                        case 14:
                                        case 17:
                                        case 18:
                                            output(fetchItem.url, fetchItem.datum, null, status, 37);
                                            break;
                                        case 12:
                                        case 13:
                                            if (status.getCode() == 12) {
                                                i = 36;
                                                z = false;
                                            } else {
                                                i = 35;
                                                z = true;
                                            }
                                            output(fetchItem.url, fetchItem.datum, content, status, i);
                                            Text handleRedirect2 = handleRedirect(fetchItem.url, fetchItem.datum, text2, status.getMessage(), z, "protocol");
                                            if (handleRedirect2 == null) {
                                                this.redirecting = false;
                                                break;
                                            } else {
                                                CrawlDatum crawlDatum2 = new CrawlDatum(1, fetchItem.datum.getFetchInterval(), fetchItem.datum.getScore());
                                                crawlDatum2.getMetaData().putAll(fetchItem.datum.getMetaData());
                                                this.scfilters.initialScore(handleRedirect2, crawlDatum2);
                                                if (this.reprUrl != null) {
                                                    crawlDatum2.getMetaData().put((Writable) Nutch.WRITABLE_REPR_URL_KEY, (Writable) new Text(this.reprUrl));
                                                }
                                                fetchItem = FetchItem.create(handleRedirect2, crawlDatum2, this.queueMode);
                                                if (fetchItem != null) {
                                                    Fetcher.this.fetchQueues.getFetchItemQueue(fetchItem.queueID).addInProgressFetchItem(fetchItem);
                                                } else {
                                                    this.redirecting = false;
                                                    Fetcher.this.reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1L);
                                                }
                                                break;
                                            }
                                        case 15:
                                        case 23:
                                            break;
                                        case 16:
                                            logError(fetchItem.url, status.getMessage());
                                            int checkExceptionThreshold = Fetcher.this.fetchQueues.checkExceptionThreshold(fetchItem.getQueueID());
                                            if (checkExceptionThreshold != 0) {
                                                Fetcher.this.reporter.incrCounter("FetcherStatus", "AboveExceptionThresholdInQueue", checkExceptionThreshold);
                                                break;
                                            }
                                            break;
                                        case 21:
                                            output(fetchItem.url, fetchItem.datum, null, status, 38);
                                            break;
                                        case 22:
                                            Fetcher.this.fetchQueues.addFetchItem(fetchItem);
                                            break;
                                    }
                                    output(fetchItem.url, fetchItem.datum, null, status, 34);
                                    if (this.redirecting && this.redirectCount > this.maxRedirect) {
                                        Fetcher.this.fetchQueues.finishFetchItem(fetchItem);
                                        if (Fetcher.LOG.isInfoEnabled()) {
                                            Fetcher.LOG.info(" - redirect count exceeded " + fetchItem.url);
                                        }
                                        output(fetchItem.url, fetchItem.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, 37);
                                    }
                                } else {
                                    Fetcher.this.fetchQueues.finishFetchItem(fetchItem, true);
                                    if (Fetcher.LOG.isDebugEnabled()) {
                                        Fetcher.LOG.debug("Denied by robots.txt: " + fetchItem.url);
                                    }
                                    output(fetchItem.url, fetchItem.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, 37);
                                    Fetcher.this.reporter.incrCounter("FetcherStatus", "robots_denied", 1L);
                                }
                                if (this.redirecting) {
                                }
                            } while (this.redirectCount <= this.maxRedirect);
                        } else {
                            if (!Fetcher.this.feeder.isAlive() && Fetcher.this.fetchQueues.getTotalSize() <= 0) {
                                if (fetchItem != null) {
                                    Fetcher.this.fetchQueues.finishFetchItem(fetchItem);
                                }
                                Fetcher.this.activeThreads.decrementAndGet();
                                Fetcher.LOG.info("-finishing thread " + getName() + ", activeThreads=" + Fetcher.this.activeThreads);
                                return;
                            }
                            Fetcher.LOG.debug(getName() + " spin-waiting ...");
                            Fetcher.this.spinWaiting.incrementAndGet();
                            try {
                                Thread.sleep(500L);
                            } catch (Exception e) {
                            }
                            Fetcher.this.spinWaiting.decrementAndGet();
                        }
                    } catch (Throwable th2) {
                        if (Fetcher.LOG.isErrorEnabled()) {
                            Fetcher.LOG.error("fetcher caught:" + th2.toString());
                        }
                        if (fetchItem != null) {
                            Fetcher.this.fetchQueues.finishFetchItem(fetchItem);
                        }
                        Fetcher.this.activeThreads.decrementAndGet();
                        Fetcher.LOG.info("-finishing thread " + getName() + ", activeThreads=" + Fetcher.this.activeThreads);
                        return;
                    }
                } catch (Throwable th3) {
                    if (fetchItem != null) {
                        Fetcher.this.fetchQueues.finishFetchItem(fetchItem);
                    }
                    Fetcher.this.activeThreads.decrementAndGet();
                    Fetcher.LOG.info("-finishing thread " + getName() + ", activeThreads=" + Fetcher.this.activeThreads);
                    throw th3;
                }
            }
        }

        /**
         * Applies the fetcher's redirect policy to a redirect target.
         *
         * <p>The target is normalized and filtered first. The redirect is dropped (returns
         * {@code null}) when the target is filtered out, equals the source URL, or points to a
         * different host while external links are ignored. Otherwise, if {@code maxRedirect > 0},
         * the redirect is followed immediately and the new URL is returned; if not, a new datum
         * for the target is emitted through {@code output(...)} so it can be fetched later, and
         * {@code null} is returned.
         *
         * @param text       key of the page being fetched (not read by this method)
         * @param crawlDatum datum of the redirecting page; its fetch interval, score and metadata
         *                   are copied to the datum emitted for the target
         * @param str        URL of the redirecting page
         * @param str2       raw redirect target
         * @param z          flag forwarded to {@code URLUtil.chooseRepr}
         *                   (presumably "temporary redirect" - confirm against caller)
         * @param str3       label for the redirect kind, used only in log messages
         * @return the target URL when it must be fetched right away, otherwise {@code null}
         */
        private Text handleRedirect(Text text, CrawlDatum crawlDatum, String str, String str2, boolean z, String str3) throws MalformedURLException, URLFilterException {
            String filter = this.urlFilters.filter(this.normalizers.normalize(str2, URLNormalizers.SCOPE_FETCHER));
            // A filtered-out target (null) is handled below; skip the host check explicitly
            // instead of relying on new URL(null) throwing into an empty catch block.
            if (this.ignoreExternalLinks && filter != null) {
                try {
                    String sourceHost = new URL(str).getHost().toLowerCase();
                    String targetHost = new URL(filter).getHost().toLowerCase();
                    if (!sourceHost.equals(targetHost)) {
                        if (Fetcher.LOG.isDebugEnabled()) {
                            Fetcher.LOG.debug(" - ignoring redirect " + str3 + " from " + str + " to " + filter + " because external links are ignored");
                        }
                        return null;
                    }
                } catch (MalformedURLException ignored) {
                    // Best effort: when either URL cannot be parsed the hosts cannot be
                    // compared, so the redirect is not rejected on this ground.
                }
            }
            if (filter == null || filter.equals(str)) {
                if (Fetcher.LOG.isDebugEnabled()) {
                    Fetcher.LOG.debug(" - " + str3 + " redirect skipped: " + (filter != null ? "to same url" : ResourceEvent.ACTION_FILTERED));
                }
                return null;
            }
            this.reprUrl = URLUtil.chooseRepr(this.reprUrl, filter, z);
            Text text2 = new Text(filter);
            if (this.maxRedirect > 0) {
                // Redirects are followed inside this fetch cycle.
                this.redirecting = true;
                this.redirectCount++;
                if (Fetcher.LOG.isDebugEnabled()) {
                    Fetcher.LOG.debug(" - " + str3 + " redirect to " + text2 + " (fetching now)");
                }
                return text2;
            }
            // Redirects are not followed now: emit a datum for the target instead.
            // NOTE(review): 67 is presumably CrawlDatum.STATUS_LINKED - confirm.
            CrawlDatum crawlDatum2 = new CrawlDatum(67, crawlDatum.getFetchInterval(), crawlDatum.getScore());
            crawlDatum2.getMetaData().putAll(crawlDatum.getMetaData());
            try {
                this.scfilters.initialScore(text2, crawlDatum2);
            } catch (ScoringFilterException e2) {
                // Report through the logger (not stderr) so the failure shows up in the task log.
                Fetcher.LOG.warn("Couldn't set initial score, url " + text2 + ": " + StringUtils.stringifyException(e2));
            }
            if (this.reprUrl != null) {
                crawlDatum2.getMetaData().put((Writable) Nutch.WRITABLE_REPR_URL_KEY, (Writable) new Text(this.reprUrl));
            }
            output(text2, crawlDatum2, null, null, 67);
            if (Fetcher.LOG.isDebugEnabled()) {
                Fetcher.LOG.debug(" - " + str3 + " redirect to " + text2 + " (fetching later)");
            }
            return null;
        }

        /**
         * Records a failed fetch: logs the failure at info level (when enabled) and bumps the
         * fetcher-wide error counter.
         *
         * @param text URL whose fetch failed
         * @param str  description of the failure
         */
        private void logError(Text text, String str) {
            final boolean infoEnabled = Fetcher.LOG.isInfoEnabled();
            if (infoEnabled) {
                StringBuilder message = new StringBuilder("fetch of ");
                message.append(text).append(" failed with: ").append(str);
                Fetcher.LOG.info(message.toString());
            }
            // Counted regardless of the log level.
            Fetcher.this.errors.incrementAndGet();
        }

        /**
         * Convenience overload of {@code output} that passes {@code 0} as the last argument of the
         * six-argument form.
         *
         * @return whatever the six-argument overload returns
         */
        private ParseStatus output(Text text, CrawlDatum crawlDatum, Content content, ProtocolStatus protocolStatus, int i) {
            final int initialDepth = 0;
            ParseStatus result = output(text, crawlDatum, content, protocolStatus, i, initialDepth);
            return result;
        }

        /**
         * Finalizes a fetch result and writes it to the job's output collector.
         *
         * <p>The datum gets the given status and the current time as fetch time. When content is
         * present it is annotated, optionally parsed (only if parsing is enabled and
         * {@code i == 33}), and each parse is emitted together with its filtered outlinks. When
         * outlink following is enabled, some outlinks are queued for fetching at depth
         * {@code i2 + 1}.
         *
         * @param text           URL key of the fetched page
         * @param crawlDatum     datum to update and emit
         * @param content        fetched content, or {@code null} when there is none
         * @param protocolStatus protocol status to store in the datum metadata, or {@code null}
         * @param i              status code set on the datum
         * @param i2             outlink depth of this item; compared against {@code maxOutlinkDepth}
         * @return the parse status of the parse stored under {@code content.getUrl()}, or
         *         {@code null} when nothing was parsed or no such parse exists
         */
        private ParseStatus output(Text text, CrawlDatum crawlDatum, Content content, ProtocolStatus protocolStatus, int i, int i2) {
            Parse parse;
            String str;
            crawlDatum.setStatus(i);
            crawlDatum.setFetchTime(System.currentTimeMillis());
            if (protocolStatus != null) {
                crawlDatum.getMetaData().put((Writable) Nutch.WRITABLE_PROTO_STATUS_KEY, (Writable) protocolStatus);
            }
            ParseResult parseResult = null;
            if (content != null) {
                Metadata metadata = content.getMetadata();
                if (content.getContentType() != null) {
                    crawlDatum.getMetaData().put((Writable) new Text("Content-Type"), (Writable) new Text(content.getContentType()));
                }
                metadata.set(Nutch.SEGMENT_NAME_KEY, Fetcher.this.segmentName);
                try {
                    this.scfilters.passScoreBeforeParsing(text, crawlDatum, content);
                } catch (Exception e) {
                    // Scoring failures are logged and otherwise ignored.
                    if (Fetcher.LOG.isWarnEnabled()) {
                        Fetcher.LOG.warn("Couldn't pass score, url " + text + " (" + e + ")");
                    }
                }
                // NOTE(review): 33 is presumably CrawlDatum.STATUS_FETCH_SUCCESS - confirm.
                if (Fetcher.this.parsing && i == 33) {
                    // Parse unless truncated content must be skipped and this content is truncated.
                    if (!this.skipTruncated || (this.skipTruncated && !ParseSegment.isTruncated(content))) {
                        try {
                            parseResult = this.parseUtil.parse(content);
                        } catch (Exception e2) {
                            Fetcher.LOG.warn("Error parsing: " + text + ": " + StringUtils.stringifyException(e2));
                        }
                    }
                    // No parse available (skipped or failed): sign the content with an empty parse.
                    if (parseResult == null) {
                        crawlDatum.setSignature(SignatureFactory.getSignature(Fetcher.this.getConf()).calculate(content, new ParseStatus().getEmptyParse(this.conf)));
                    }
                }
                content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(i));
            }
            try {
                // Emit the datum first, then the raw content if it is being stored.
                Fetcher.this.output.collect(text, new NutchWritable(crawlDatum));
                if (content != null && Fetcher.this.storingContent) {
                    Fetcher.this.output.collect(text, new NutchWritable(content));
                }
                if (parseResult != null) {
                    Iterator<Map.Entry<Text, Parse>> it = parseResult.iterator();
                    while (it.hasNext()) {
                        Map.Entry<Text, Parse> next = it.next();
                        Text key = next.getKey();
                        Parse value = next.getValue();
                        ParseStatus status = value.getData().getStatus();
                        // data is taken from the original parse; it stays in use even when
                        // value is replaced by an empty parse just below.
                        ParseData data = value.getData();
                        if (!status.isSuccess()) {
                            Fetcher.LOG.warn("Error parsing: " + text + ": " + status);
                            value = status.getEmptyParse(Fetcher.this.getConf());
                        }
                        byte[] calculate = SignatureFactory.getSignature(Fetcher.this.getConf()).calculate(content, value);
                        data.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, Fetcher.this.segmentName);
                        data.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(calculate));
                        data.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(crawlDatum.getFetchTime()));
                        // Only the parse keyed by the fetched URL sets the datum's signature.
                        // The datum was already collected above; whether this still reaches the
                        // output depends on the collector - TODO confirm.
                        if (key.equals(text)) {
                            crawlDatum.setSignature(calculate);
                        }
                        try {
                            this.scfilters.passScoreAfterParsing(key, content, value);
                        } catch (Exception e3) {
                            if (Fetcher.LOG.isWarnEnabled()) {
                                Fetcher.LOG.warn("Couldn't pass score, url " + text + " (" + e3 + ")");
                            }
                        }
                        Outlink[] outlinks = data.getOutlinks();
                        int min = Math.min(this.maxOutlinks, outlinks.length);
                        // str is the lower-cased host of this parse's URL when external links are
                        // ignored, and null otherwise or when the URL cannot be parsed.
                        if (this.ignoreExternalLinks) {
                            try {
                                str = new URL(key.toString()).getHost().toLowerCase();
                            } catch (MalformedURLException e4) {
                                str = null;
                            }
                        } else {
                            str = null;
                        }
                        int i3 = 0;
                        ArrayList arrayList = new ArrayList(min);
                        HashSet hashSet = new HashSet(min);
                        // Keep at most min outlinks that survive filtering/normalization;
                        // hashSet holds their distinct URLs.
                        for (int i4 = 0; i4 < outlinks.length && i3 < min; i4++) {
                            String filterNormalize = ParseOutputFormat.filterNormalize(key.toString(), outlinks[i4].getToUrl(), str, this.ignoreExternalLinks, this.urlFilters, this.normalizers);
                            if (filterNormalize != null) {
                                i3++;
                                outlinks[i4].setUrl(filterNormalize);
                                arrayList.add(outlinks[i4]);
                                hashSet.add(filterNormalize);
                            }
                        }
                        // Outlink following: queue up to maxOutlinkDepthNumLinks of the distinct
                        // URLs for fetching one level deeper.
                        if (this.maxOutlinkDepth > 0 && i2 < this.maxOutlinkDepth) {
                            Fetcher.this.reporter.incrCounter("FetcherOutlinks", "outlinks_detected", hashSet.size());
                            Iterator it2 = hashSet.iterator();
                            for (int i5 = 0; it2.hasNext() && i5 < this.maxOutlinkDepthNumLinks; i5++) {
                                Fetcher.this.reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1L);
                                // NOTE(review): 67 is presumably CrawlDatum.STATUS_LINKED - confirm.
                                Fetcher.this.fetchQueues.addFetchItem(FetchItem.create(new Text((String) it2.next()), new CrawlDatum(67, this.interval), this.queueMode, i2 + 1));
                            }
                        }
                        data.setOutlinks((Outlink[]) arrayList.toArray(new Outlink[arrayList.size()]));
                        Fetcher.this.output.collect(key, new NutchWritable(new ParseImpl(new ParseText(value.getText()), data, value.isCanonical())));
                    }
                }
            } catch (IOException e5) {
                // Collector failures are logged and swallowed; the method still returns normally.
                if (Fetcher.LOG.isErrorEnabled()) {
                    Fetcher.LOG.error("fetcher caught:" + e5.toString());
                }
            }
            // parseResult is only assigned inside the content != null branch, so content is
            // non-null whenever content.getUrl() is reached here.
            if (parseResult == null || parseResult.isEmpty() || (parse = parseResult.get(content.getUrl())) == null) {
                return null;
            }
            Fetcher.this.reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parse.getData().getStatus().getMajorCode()], 1L);
            return parse.getData().getStatus();
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/fetcher/Fetcher$InputFormat.class */
    /**
     * Sequence-file input format that produces exactly one split per input file, each covering
     * the file from offset 0 to its full length.
     */
    public static class InputFormat extends SequenceFileInputFormat<Text, CrawlDatum> {
        /**
         * Builds one whole-file split for every file reported by {@code listStatus}.
         *
         * @param jobConf job configuration used to list the input files
         * @param i       requested number of splits; not used
         * @return one {@link FileSplit} per input file, in listing order
         */
        @Override // org.apache.hadoop.mapred.FileInputFormat, org.apache.hadoop.mapred.InputFormat
        public InputSplit[] getSplits(JobConf jobConf, int i) throws IOException {
            final FileStatus[] inputFiles = listStatus(jobConf);
            final FileSplit[] wholeFileSplits = new FileSplit[inputFiles.length];
            int slot = 0;
            for (FileStatus inputFile : inputFiles) {
                wholeFileSplits[slot] = new FileSplit(inputFile.getPath(), 0L, inputFile.getLen(), (String[]) null);
                slot++;
            }
            return wholeFileSplits;
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/fetcher/Fetcher$QueueFeeder.class */
    private static class QueueFeeder extends Thread {
        private RecordReader<Text, CrawlDatum> reader;
        private FetchItemQueues queues;
        private int size;
        private long timelimit = -1;

        public QueueFeeder(RecordReader<Text, CrawlDatum> recordReader, FetchItemQueues fetchItemQueues, int i) {
            this.reader = recordReader;
            this.queues = fetchItemQueues;
            this.size = i;
            setDaemon(true);
            setName("QueueFeeder");
        }

        public void setTimeLimit(long j) {
            this.timelimit = j;
        }

        @Override // java.lang.Thread, java.lang.Runnable
        public void run() {
            boolean z = true;
            int i = 0;
            int i2 = 0;
            while (z) {
                if (System.currentTimeMillis() < this.timelimit || this.timelimit == -1) {
                    int totalSize = this.size - this.queues.getTotalSize();
                    if (totalSize <= 0) {
                        try {
                            Thread.sleep(1000L);
                        } catch (Exception e) {
                        }
                    } else {
                        Fetcher.LOG.debug("-feeding " + totalSize + " input urls ...");
                        while (totalSize > 0 && z) {
                            try {
                                Text text = new Text();
                                CrawlDatum crawlDatum = new CrawlDatum();
                                z = this.reader.next(text, crawlDatum);
                                if (z) {
                                    this.queues.addFetchItem(text, crawlDatum);
                                    i++;
                                    totalSize--;
                                }
                            } catch (IOException e2) {
                                Fetcher.LOG.error("QueueFeeder error reading input, record " + i, (Throwable) e2);
                                return;
                            }
                        }
                    }
                } else {
                    try {
                        z = this.reader.next(new Text(), new CrawlDatum());
                        i2++;
                    } catch (IOException e3) {
                        Fetcher.LOG.error("QueueFeeder error reading input, record " + i, (Throwable) e3);
                        return;
                    }
                }
            }
            Fetcher.LOG.info("QueueFeeder finished: total " + i + " records + hit by time limit :" + i2);
        }
    }

    /**
     * Creates a fetcher without a configuration, initializing all counters to zero and the
     * start / last-request timestamps to the current time.
     */
    public Fetcher() {
        super(null);
        // Timestamps first: lastRequestStart is seeded from start.
        start = System.currentTimeMillis();
        lastRequestStart = new AtomicLong(start);
        // Counters all begin at zero.
        activeThreads = new AtomicInteger();
        spinWaiting = new AtomicInteger();
        pages = new AtomicInteger();
        errors = new AtomicInteger();
        bytes = new AtomicLong();
    }

    /**
     * Creates a fetcher bound to the given configuration, initializing all counters to zero and
     * the start / last-request timestamps to the current time.
     *
     * @param configuration configuration handed to the superclass constructor
     */
    public Fetcher(Configuration configuration) {
        super(configuration);
        // Timestamps first: lastRequestStart is seeded from start.
        start = System.currentTimeMillis();
        lastRequestStart = new AtomicLong(start);
        // Counters all begin at zero.
        activeThreads = new AtomicInteger();
        spinWaiting = new AtomicInteger();
        pages = new AtomicInteger();
        errors = new AtomicInteger();
        bytes = new AtomicLong();
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Accounts for one more fetched page.
     *
     * @param i number of bytes to add to the running byte total
     */
    public void updateStatus(int i) throws IOException {
        pages.incrementAndGet();
        final long fetchedBytes = i;
        bytes.addAndGet(fetchedBytes);
    }

    /**
     * Publishes a one-line progress summary through the reporter: thread/queue counts, totals,
     * and average versus last-interval throughput.
     *
     * @param i  pages fetched during the last reporting interval
     * @param i2 bytes fetched during the last reporting interval
     */
    private void reportStatus(int i, int i2) throws IOException {
        // Whole seconds since start; clamped to 1 so the averages stay finite during the
        // first second instead of dividing by zero.
        long elapsedSecs = Math.max(1L, (System.currentTimeMillis() - this.start) / 1000);
        // Scale by 10, round, then divide by 10.0f (not 10) to keep one decimal place.
        float avgPagesSec = Math.round((this.pages.get() * 10.0f) / elapsedSecs) / 10.0f;
        // bytes * 8 / 1000 -> kilobits.
        int avgKbitsSec = Math.round(((this.bytes.get() * 8.0f) / 1000.0f) / elapsedSecs);
        String status = this.activeThreads + " threads, "
                + this.fetchQueues.getQueueCount() + " queues, "
                + this.fetchQueues.getTotalSize() + " URLs queued, "
                + this.pages + " pages, "
                + this.errors + " errors, "
                + avgPagesSec + " (" + i + ") pages/s, "
                + avgKbitsSec + " (" + i2 + ") kbits/s, ";
        this.reporter.setStatus(status);
    }

    /**
     * Adopts the job configuration and caches the settings read from it: the segment name and
     * the "parse while fetching" / "store content" switches.
     *
     * @param jobConf configuration of the running job
     */
    @Override // org.apache.hadoop.mapred.JobConfigurable
    public void configure(JobConf jobConf) {
        setConf(jobConf);
        final String segment = jobConf.get(Nutch.SEGMENT_NAME_KEY);
        final boolean storeContent = isStoringContent(jobConf);
        final boolean parseWhileFetching = isParsing(jobConf);
        segmentName = segment;
        storingContent = storeContent;
        parsing = parseWhileFetching;
    }

    /** Does nothing; the body is intentionally empty. */
    public void close() {
    }

    /**
     * Reads the {@code fetcher.parse} switch.
     *
     * @param configuration configuration to read from
     * @return the configured value, or {@code true} when the property is not set
     */
    public static boolean isParsing(Configuration configuration) {
        final boolean enabledByDefault = true;
        final boolean parsingEnabled = configuration.getBoolean("fetcher.parse", enabledByDefault);
        return parsingEnabled;
    }

    /**
     * Reads the {@code fetcher.store.content} switch.
     *
     * @param configuration configuration to read from
     * @return the configured value, or {@code true} when the property is not set
     */
    public static boolean isStoringContent(Configuration configuration) {
        final boolean enabledByDefault = true;
        final boolean storingEnabled = configuration.getBoolean("fetcher.store.content", enabledByDefault);
        return storingEnabled;
    }

    /**
     * Map-runner entry point: starts the queue feeder and the fetcher threads, then loops once
     * per second reporting progress until no fetcher thread is active or the threads look hung.
     *
     * <p>Each pass of the monitoring loop also dumps the queues when the feeder is done and few
     * items remain, empties the queues after too many passes below the throughput threshold,
     * applies the queue time limit once the feeder has finished, and aborts when no request has
     * started within the hung-thread timeout.
     *
     * @param recordReader    input of (url, datum) records handed to the feeder
     * @param outputCollector collector the fetcher threads write to
     * @param reporter        progress/counter sink
     */
    @Override // org.apache.hadoop.mapred.MapRunnable
    public void run(RecordReader<Text, CrawlDatum> recordReader, OutputCollector<Text, NutchWritable> outputCollector, Reporter reporter) throws IOException {
        this.output = outputCollector;
        this.reporter = reporter;
        this.fetchQueues = new FetchItemQueues(getConf());

        final int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: threads: " + threadCount);
        }
        final int timeoutDivisor = getConf().getInt("fetcher.threads.timeout.divisor", 2);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: time-out divisor: " + timeoutDivisor);
        }

        // The feeder keeps roughly threadCount * multiplier items queued.
        final int queueTargetSize = threadCount * getConf().getInt("fetcher.queue.depth.multiplier", 50);
        this.feeder = new QueueFeeder(recordReader, this.fetchQueues, queueTargetSize);
        final long timeLimit = getConf().getLong("fetcher.timelimit", -1L);
        if (timeLimit != -1) {
            this.feeder.setTimeLimit(timeLimit);
        }
        this.feeder.start();

        getConf().setBoolean(Protocol.CHECK_BLOCKING, false);
        getConf().setBoolean(Protocol.CHECK_ROBOTS, false);

        for (int started = 0; started < threadCount; started++) {
            new FetcherThread(getConf()).start();
        }

        // No request started for this long => the threads are considered hung.
        final long hungTimeout = getConf().getInt("mapred.task.timeout", 600000) / timeoutDivisor;

        int belowThresholdPasses = 0;
        int thresholdPages = getConf().getInt("fetcher.throughput.threshold.pages", -1);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: throughput threshold: " + thresholdPages);
        }
        final int thresholdRetries = getConf().getInt("fetcher.throughput.threshold.retries", 5);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: throughput threshold retries: " + thresholdRetries);
        }
        final long thresholdCheckAfter = getConf().getLong("fetcher.throughput.threshold.check.after", -1L);

        do {
            // Sample the counters around a one-second sleep to get per-interval deltas.
            final int pagesBefore = this.pages.get();
            final int bytesBefore = (int) this.bytes.get();
            try {
                Thread.sleep(1000L);
            } catch (InterruptedException ignored) {
            }
            final int pagesDelta = this.pages.get() - pagesBefore;
            final int bytesDelta = ((int) this.bytes.get()) - bytesBefore;

            reporter.incrCounter("FetcherStatus", "bytes_downloaded", bytesDelta);
            reportStatus(pagesDelta, bytesDelta);
            LOG.info("-activeThreads=" + this.activeThreads + ", spinWaiting=" + this.spinWaiting.get() + ", fetchQueues.totalSize=" + this.fetchQueues.getTotalSize());

            if (!this.feeder.isAlive() && this.fetchQueues.getTotalSize() < 5) {
                this.fetchQueues.dump();
            }

            if (thresholdCheckAfter < System.currentTimeMillis() && thresholdPages != -1 && pagesDelta < thresholdPages) {
                belowThresholdPasses++;
                LOG.warn(belowThresholdPasses + ": dropping below configured threshold of " + thresholdPages + " pages per second");
                if (belowThresholdPasses == thresholdRetries) {
                    LOG.warn("Dropped below threshold too many times, killing!");
                    // -1 disables any further threshold checks.
                    thresholdPages = -1;
                    final int dropped = this.fetchQueues.emptyQueues();
                    if (dropped != 0) {
                        reporter.incrCounter("FetcherStatus", "hitByThrougputThreshold", dropped);
                    }
                }
            }

            if (!this.feeder.isAlive()) {
                final int timedOut = this.fetchQueues.checkTimelimit();
                if (timedOut != 0) {
                    reporter.incrCounter("FetcherStatus", "hitByTimeLimit", timedOut);
                }
            }

            final long sinceLastRequest = System.currentTimeMillis() - this.lastRequestStart.get();
            if (sinceLastRequest > hungTimeout) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Aborting with " + this.activeThreads + " hung threads.");
                }
                return;
            }
        } while (this.activeThreads.get() > 0);
        LOG.info("-activeThreads=" + this.activeThreads);
    }

    /**
     * Configures and runs the fetch job for one segment, blocking until the job has finished.
     *
     * <p>Before submitting, the relative settings {@code fetcher.timelimit.mins} and
     * {@code fetcher.throughput.threshold.check.after} (minutes) are turned into absolute
     * timestamps in the configuration.
     *
     * @param path segment directory; input is read from its generate sub-directory and output
     *             is written to the segment itself
     * @param i    number of fetcher threads
     */
    public void fetch(Path path, int i) throws IOException {
        checkConfiguration();

        final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        final long startedAt = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: starting at " + timestampFormat.format(Long.valueOf(startedAt)));
            LOG.info("Fetcher: segment: " + path);
        }

        // Relative time limit (minutes) -> absolute deadline (ms since epoch).
        final long limitMinutes = getConf().getLong("fetcher.timelimit.mins", -1L);
        if (limitMinutes != -1) {
            final long deadline = System.currentTimeMillis() + (limitMinutes * 60 * 1000);
            LOG.info("Fetcher Timelimit set for : " + deadline);
            getConf().setLong("fetcher.timelimit", deadline);
        }

        // Same conversion for the delay before throughput checks begin.
        final long checkAfterMinutes = getConf().getLong("fetcher.throughput.threshold.check.after", 10L);
        final long checkAfter = System.currentTimeMillis() + (checkAfterMinutes * 60 * 1000);
        getConf().setLong("fetcher.throughput.threshold.check.after", checkAfter);

        final int outlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
        if (outlinkDepth > 0) {
            LOG.info("Fetcher: following outlinks up to depth: " + outlinkDepth);
            final int linksPerPage = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
            final int depthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);
            int estimatedOutlinks = 0;
            for (int depth = 1; depth <= outlinkDepth; depth++) {
                // NOTE(review): integer division, so the quotient truncates before the
                // multiply - confirm that is intended.
                estimatedOutlinks += (depthDivisor / depth) * linksPerPage;
            }
            LOG.info("Fetcher: maximum outlinks to follow: " + estimatedOutlinks);
        }

        final NutchJob job = new NutchJob(getConf());
        job.setJobName("fetch " + path);
        job.setInt("fetcher.threads.fetch", i);
        job.set(Nutch.SEGMENT_NAME_KEY, path.getName());
        job.setSpeculativeExecution(false);

        final Path generateDir = new Path(path, CrawlDatum.GENERATE_DIR_NAME);
        FileInputFormat.addInputPath(job, generateDir);
        job.setInputFormat(InputFormat.class);
        job.setMapRunnerClass(Fetcher.class);

        FileOutputFormat.setOutputPath(job, path);
        job.setOutputFormat(FetcherOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NutchWritable.class);

        JobClient.runJob(job);

        final long finishedAt = System.currentTimeMillis();
        LOG.info("Fetcher: finished at " + timestampFormat.format(Long.valueOf(finishedAt)) + ", elapsed: " + TimingUtil.elapsedTime(startedAt, finishedAt));
    }

    /**
     * Command-line entry point: runs a new {@code Fetcher} through {@code ToolRunner} with a
     * freshly created Nutch configuration and exits the JVM with the tool's return code.
     *
     * @param strArr command-line arguments, passed through unchanged
     */
    public static void main(String[] strArr) throws Exception {
        final int exitCode = ToolRunner.run(NutchConfiguration.create(), new Fetcher(), strArr);
        System.exit(exitCode);
    }

    /**
     * Parses the command line ({@code <segment> [-threads n]}) and fetches the given segment.
     *
     * <p>A {@code -threads} option without a value, or with a non-numeric value, is reported on
     * stderr with the usage line and makes the method return {@code -1} instead of failing with
     * an {@code ArrayIndexOutOfBoundsException} or {@code NumberFormatException}.
     *
     * @param strArr command-line arguments; the first one is the segment path
     * @return {@code 0} on success, {@code -1} on bad arguments or when the fetch fails
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        final String usage = "Usage: Fetcher <segment> [-threads n]";
        if (strArr.length < 1) {
            System.err.println(usage);
            return -1;
        }
        Path path = new Path(strArr[0]);
        int threads = getConf().getInt("fetcher.threads.fetch", 10);
        for (int idx = 1; idx < strArr.length; idx++) {
            if (!strArr[idx].equals("-threads")) {
                // Unrecognized arguments are ignored, as before.
                continue;
            }
            idx++;
            if (idx >= strArr.length) {
                System.err.println("Fetcher: missing value for -threads");
                System.err.println(usage);
                return -1;
            }
            try {
                threads = Integer.parseInt(strArr[idx]);
            } catch (NumberFormatException e) {
                System.err.println("Fetcher: invalid value for -threads: " + strArr[idx]);
                System.err.println(usage);
                return -1;
            }
        }
        getConf().setInt("fetcher.threads.fetch", threads);
        try {
            fetch(path, threads);
            return 0;
        } catch (Exception e) {
            LOG.error("Fetcher: " + StringUtils.stringifyException(e));
            return -1;
        }
    }

    /**
     * Validates the agent-related configuration before a fetch.
     *
     * <p>Fails when {@code http.agent.name} is missing or blank. Warns when the first entry of
     * the comma-separated {@code http.robots.agents} list is not the agent name; a missing or
     * empty list is treated the same way and no longer causes a {@code NullPointerException} or
     * {@code IndexOutOfBoundsException}.
     *
     * @throws IllegalArgumentException if {@code http.agent.name} is not set or blank
     */
    private void checkConfiguration() {
        String str = getConf().get("http.agent.name");
        if (str == null || str.trim().length() == 0) {
            String message = "Fetcher: No agents listed in 'http.agent.name' property.";
            if (LOG.isErrorEnabled()) {
                LOG.error(message);
            }
            throw new IllegalArgumentException(message);
        }
        // Only the first listed robots agent matters for this check.
        String firstRobotsAgent = null;
        String robotsAgents = getConf().get("http.robots.agents");
        if (robotsAgents != null) {
            StringTokenizer stringTokenizer = new StringTokenizer(robotsAgents, ",");
            if (stringTokenizer.hasMoreTokens()) {
                firstRobotsAgent = stringTokenizer.nextToken().trim();
            }
        }
        boolean listedFirst = firstRobotsAgent != null && firstRobotsAgent.equalsIgnoreCase(str);
        if (!listedFirst && LOG.isWarnEnabled()) {
            LOG.warn("Fetcher: Your 'http.agent.name' value should be listed first in 'http.robots.agents' property.");
        }
    }
}
