package org.apache.nutch.crawl;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/CrawlDbFilter.class */
/**
 * Mapper over CrawlDb entries that optionally drops, normalizes and filters
 * each record's URL before re-emitting the record.
 *
 * <p>Three independent switches are read from the job configuration in
 * {@link #configure(JobConf)}:
 * <ul>
 *   <li>{@link #URL_FILTERING} - run the URL through {@link URLFilters};</li>
 *   <li>{@link #URL_NORMALIZING} - run the URL through {@link URLNormalizers}
 *       using the scope configured under {@link #URL_NORMALIZING_SCOPE};</li>
 *   <li>{@code CrawlDb.CRAWLDB_PURGE_404} - drop records whose status equals
 *       {@link #PURGED_STATUS}.</li>
 * </ul>
 * All three default to {@code false}. A record is emitted only if its URL is
 * still non-null after every enabled step.
 */
public class CrawlDbFilter implements Mapper<Text, CrawlDatum, Text, CrawlDatum> {
    public static final String URL_FILTERING = "crawldb.url.filters";
    public static final String URL_NORMALIZING = "crawldb.url.normalizers";
    public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
    public static final Logger LOG = LoggerFactory.getLogger(CrawlDbFilter.class);

    /**
     * Status value that marks a record for removal when purging is enabled.
     * NOTE(review): presumably the inlined value of CrawlDatum.STATUS_DB_GONE -
     * confirm against CrawlDatum before replacing this literal with the constant.
     */
    private static final int PURGED_STATUS = 3;

    private boolean urlFiltering;
    private boolean urlNormalizers;
    private boolean url404Purging;
    private URLFilters filters;
    private URLNormalizers normalizers;
    private String scope;

    /** Output key instance reused for every emitted record; overwritten before each collect. */
    private final Text newKey = new Text();

    /**
     * Reads the filtering, normalizing and purging switches from the job
     * configuration and builds only the helpers whose switch is enabled.
     *
     * @param jobConf job configuration supplying the switches and the normalizer scope
     */
    @Override
    public void configure(JobConf jobConf) {
        this.urlFiltering = jobConf.getBoolean(URL_FILTERING, false);
        this.urlNormalizers = jobConf.getBoolean(URL_NORMALIZING, false);
        this.url404Purging = jobConf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
        if (this.urlFiltering) {
            this.filters = new URLFilters(jobConf);
        }
        if (this.urlNormalizers) {
            this.scope = jobConf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
            this.normalizers = new URLNormalizers(jobConf, this.scope);
        }
    }

    /** No resources are held by this mapper, so there is nothing to release. */
    @Override
    public void close() {
    }

    /**
     * Emits the record under its (possibly normalized) URL, or emits nothing if
     * the record is purged, or if normalization or filtering yields {@code null}
     * or throws.
     *
     * @param text            input key holding the record's URL
     * @param crawlDatum      record associated with the URL; passed through unchanged
     * @param outputCollector receives the surviving record
     * @param reporter        unused
     * @throws IOException if the collector fails to accept the record
     */
    @Override
    public void map(Text text, CrawlDatum crawlDatum, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
        // Check the status first and stop here for purged records, so that a
        // null URL is never handed to the normalizers (the filter step was
        // already null-guarded; the normalize step was not).
        if (this.url404Purging && PURGED_STATUS == crawlDatum.getStatus()) {
            return;
        }

        String url = text.toString();

        if (url != null && this.urlNormalizers) {
            try {
                url = this.normalizers.normalize(url, this.scope);
            } catch (Exception e) {
                // Best effort: a URL that cannot be normalized is dropped, not fatal.
                LOG.warn("Skipping " + url + ":" + e);
                url = null;
            }
        }

        if (url != null && this.urlFiltering) {
            try {
                url = this.filters.filter(url);
            } catch (Exception e) {
                // Best effort: a URL that makes a filter fail is dropped, not fatal.
                LOG.warn("Skipping " + url + ":" + e);
                url = null;
            }
        }

        if (url != null) {
            this.newKey.set(url);
            outputCollector.collect(this.newKey, crawlDatum);
        }
    }
}
