package org.apache.nutch.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/CrawlDbReducer.class */
/**
 * Reducer that collapses every {@link CrawlDatum} collected for one key into at most one
 * output record.
 *
 * <p>Incoming values are sorted into three groups:
 * <ul>
 *   <li>the "old" value: values for which {@code CrawlDatum.hasDbStatus} is true (newest
 *       fetch time wins);</li>
 *   <li>the "fetch" value: values for which {@code CrawlDatum.hasFetchStatus} is true
 *       (newest fetch time wins);</li>
 *   <li>auxiliary values, told apart by their numeric status: a signature carrier (65),
 *       inlinks (67) and a metadata carrier (68).</li>
 * </ul>
 *
 * <p>Status codes appear here as numeric literals. The meanings given in comments are
 * derived from how this class uses each code; NOTE(review): they presumably correspond to
 * named {@code CrawlDatum.STATUS_*} constants - confirm against {@code CrawlDatum} before
 * replacing the literals.
 */
public class CrawlDbReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
    public static final Logger LOG = LoggerFactory.getLogger(CrawlDbReducer.class);

    /** Retry count at which a retried page is given status 3 instead of status 1. */
    private int retryMax;
    /** Scratch record that is filled and emitted by {@link #reduce}; reused between calls. */
    private CrawlDatum result = new CrawlDatum();
    /** Bounded queue holding the inlink values (status 67) seen for the current key. */
    private InlinkPriorityQueue linked = null;
    private ScoringFilters scfilters = null;
    /** When false, keys that have no "old" value produce no output. */
    private boolean additionsAllowed;
    /** Fetch intervals above this value trigger {@code FetchSchedule.forceRefetch}. */
    private int maxInterval;
    private FetchSchedule schedule;

    /**
     * Reads the reducer settings from the job configuration.
     *
     * <p>{@code db.fetch.interval.max} takes precedence; {@code db.max.fetch.interval} is
     * only consulted when the former is 0, and is then multiplied by
     * {@code FetchSchedule.SECONDS_PER_DAY}.
     *
     * @param jobConf job configuration to read from
     */
    @Override // org.apache.hadoop.mapred.JobConfigurable
    public void configure(JobConf jobConf) {
        this.retryMax = jobConf.getInt("db.fetch.retry.max", 3);
        this.scfilters = new ScoringFilters(jobConf);
        this.additionsAllowed = jobConf.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
        int fallbackMaxInterval = jobConf.getInt("db.max.fetch.interval", 0);
        this.maxInterval = jobConf.getInt("db.fetch.interval.max", 0);
        if (fallbackMaxInterval > 0 && this.maxInterval == 0) {
            this.maxInterval = fallbackMaxInterval * FetchSchedule.SECONDS_PER_DAY;
        }
        // NOTE(review): if neither property is set, maxInterval stays 0 and every record
        // with a positive fetch interval goes through forceRefetch in reduce() - confirm
        // that deployments always configure one of the two properties.
        this.schedule = FetchScheduleFactory.getFetchSchedule(jobConf);
        this.linked = new InlinkPriorityQueue(jobConf.getInt("db.update.max.inlinks", 10000));
    }

    /** Nothing to release. */
    @Override // java.io.Closeable, java.lang.AutoCloseable
    public void close() {
    }

    /**
     * Merges all values of one key and emits at most one record for it.
     *
     * <p>Nothing is emitted when the key has no "old" value and additions are disabled,
     * when neither an "old", a "fetch" nor an inlink value is present, or when the only
     * usable value is a lone signature carrier (status 65).
     *
     * @param text            key of the record being reduced
     * @param it              all values collected for {@code text}
     * @param outputCollector receives the merged record
     * @param reporter        used to count emitted records per status name in the
     *                        "CrawlDB status" counter group
     * @throws IOException      if the collector fails to write
     * @throws RuntimeException if the selected "fetch" value has a status this method does
     *                          not handle
     */
    @Override // org.apache.hadoop.mapred.Reducer
    public void reduce(Text text, Iterator<CrawlDatum> it, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
        CrawlDatum fetch = new CrawlDatum();
        CrawlDatum old = new CrawlDatum();
        boolean fetchSet = false;
        boolean oldSet = false;
        byte[] signature = null;
        // Becomes true as soon as the key is known to have more than one value. A single
        // value is used by reference; with several values each one is copied via set().
        // NOTE(review): presumably needed because the framework may reuse the value
        // instance between iterator steps - confirm against the Hadoop Reducer contract.
        boolean multiple = false;
        this.linked.clear();
        org.apache.hadoop.io.MapWritable extraMetaData = null;

        while (it.hasNext()) {
            CrawlDatum datum = it.next();
            if (!multiple && it.hasNext()) {
                multiple = true;
            }

            if (CrawlDatum.hasDbStatus(datum)) {
                if (!oldSet) {
                    if (multiple) {
                        old.set(datum);
                    } else {
                        old = datum;
                    }
                    oldSet = true;
                } else if (old.getFetchTime() < datum.getFetchTime()) {
                    // Keep the value with the latest fetch time.
                    old.set(datum);
                }
                continue;
            }

            if (CrawlDatum.hasFetchStatus(datum)) {
                if (!fetchSet) {
                    if (multiple) {
                        fetch.set(datum);
                    } else {
                        fetch = datum;
                    }
                    fetchSet = true;
                } else if (fetch.getFetchTime() < datum.getFetchTime()) {
                    // Keep the value with the latest fetch time.
                    fetch.set(datum);
                }
                continue;
            }

            // Neither a db nor a fetch status: auxiliary value.
            switch (datum.getStatus()) {
                case 65:
                    // Signature carrier.
                    signature = datum.getSignature();
                    break;
                case 67: {
                    // Inlink: queued so the links can be handed to the scoring filters.
                    CrawlDatum link;
                    if (multiple) {
                        link = new CrawlDatum();
                        link.set(datum);
                    } else {
                        link = datum;
                    }
                    this.linked.insert(link);
                    break;
                }
                case 68:
                    // Metadata carrier; merged into the result further down.
                    extraMetaData = datum.getMetaData();
                    break;
                default:
                    LOG.warn("Unknown status, key: " + text + ", datum: " + datum);
                    break;
            }
        }

        // Drain the queue into a list, in the order pop() yields the entries.
        int numLinks = this.linked.size();
        ArrayList<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
        for (int i = 0; i < numLinks; i++) {
            linkList.add(this.linked.pop());
        }

        // Key has no stored value and new entries are not allowed: emit nothing.
        if (!oldSet && !this.additionsAllowed) {
            return;
        }

        // Without a fetch value, the first queued inlink stands in for it.
        if (!fetchSet && !linkList.isEmpty()) {
            fetch = linkList.get(0);
            fetchSet = true;
        }

        // Still nothing new: pass the old value through unchanged, if there is one.
        if (!fetchSet) {
            if (oldSet) {
                outputCollector.collect(text, old);
                reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(old.getStatus())).increment(1L);
            } else {
                // Render the bytes; concatenating the array itself only prints its identity.
                LOG.warn("Missing fetch and old value, signature=" + toHex(signature));
            }
            return;
        }

        if (signature == null) {
            signature = fetch.getSignature();
        }
        long prevModifiedTime = oldSet ? old.getModifiedTime() : 0L;
        long prevFetchTime = oldSet ? old.getFetchTime() : 0L;

        // Start from the fetch (or inlink) value, then layer the old value's data under it.
        this.result.set(fetch);
        if (oldSet) {
            if (old.getMetaData().size() > 0) {
                this.result.putAllMetaData(old);
                // Re-apply the fetch metadata so it wins over the old entries.
                if (fetch.getMetaData().size() > 0) {
                    this.result.putAllMetaData(fetch);
                }
            }
            if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
                this.result.setModifiedTime(old.getModifiedTime());
            }
        }

        switch (fetch.getStatus()) {
            case 33:
            case 35:
            case 36:
            case 38: {
                // Modification state handed to the schedule: 0 = not determined,
                // 1 = old and new signatures differ, 2 = not modified.
                // NOTE(review): presumably FetchSchedule.STATUS_* values - confirm.
                int modified = 0;
                if (fetch.getStatus() == 38) {
                    modified = 2;
                } else if (oldSet && old.getSignature() != null && signature != null) {
                    modified = SignatureComparator._compare(old.getSignature(), signature) != 0 ? 1 : 2;
                }
                this.result = this.schedule.setFetchSchedule(text, this.result, prevFetchTime, prevModifiedTime,
                        fetch.getFetchTime(), fetch.getModifiedTime(), modified);
                if (modified == 2) {
                    this.result.setStatus(6);
                    if (oldSet) {
                        this.result.setSignature(old.getSignature());
                    }
                } else {
                    switch (fetch.getStatus()) {
                        case 33:
                            this.result.setStatus(2);
                            break;
                        case 35:
                            this.result.setStatus(4);
                            break;
                        case 36:
                            this.result.setStatus(5);
                            break;
                        default:
                            LOG.warn("Unexpected status: " + ((int) fetch.getStatus()) + " resetting to old status.");
                            if (oldSet) {
                                this.result.setStatus(old.getStatus());
                            } else {
                                this.result.setStatus(1);
                            }
                            break;
                    }
                    this.result.setSignature(signature);
                    if (extraMetaData != null) {
                        for (Map.Entry<Writable, Writable> entry : extraMetaData.entrySet()) {
                            this.result.getMetaData().put(entry.getKey(), entry.getValue());
                        }
                    }
                }
                // Interval beyond the configured maximum: let the schedule force a refetch.
                if (this.maxInterval < this.result.getFetchInterval()) {
                    this.result = this.schedule.forceRefetch(text, this.result, false);
                }
                break;
            }
            case 34:
                // Retry schedule; status 1 while below retryMax, status 3 afterwards.
                if (oldSet) {
                    this.result.setSignature(old.getSignature());
                }
                this.result = this.schedule.setPageRetrySchedule(text, this.result, prevFetchTime, prevModifiedTime,
                        fetch.getFetchTime());
                if (this.result.getRetriesSinceFetch() < this.retryMax) {
                    this.result.setStatus(1);
                } else {
                    this.result.setStatus(3);
                }
                break;
            case 37:
                // "Page gone" schedule; stored with status 3.
                if (oldSet) {
                    this.result.setSignature(old.getSignature());
                }
                this.result.setStatus(3);
                this.result = this.schedule.setPageGoneSchedule(text, this.result, prevFetchTime, prevModifiedTime,
                        fetch.getFetchTime());
                break;
            case 65:
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + text);
                }
                return;
            case 67:
                // Only an inlink is available: keep the old value as is, or create a
                // fresh entry with status 1 and an initial score.
                if (oldSet) {
                    this.result.set(old);
                } else {
                    this.result = this.schedule.initializeSchedule(text, this.result);
                    this.result.setStatus(1);
                    try {
                        this.scfilters.initialScore(text, this.result);
                    } catch (ScoringFilterException e) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn("Cannot filter init score for url " + text + ", using default: " + e.getMessage());
                        }
                        this.result.setScore(0.0f);
                    }
                }
                break;
            default:
                throw new RuntimeException("Unknown status: " + ((int) fetch.getStatus()) + " " + text);
        }

        // Score update is best effort: a failure is logged and the record is still emitted.
        try {
            this.scfilters.updateDbScore(text, oldSet ? old : null, this.result, linkList);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't update score, key=" + text + ": " + e);
            }
        }
        this.result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
        outputCollector.collect(text, this.result);
        reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(this.result.getStatus())).increment(1L);
    }

    /**
     * Renders a byte array as lower-case hexadecimal for log output.
     *
     * @param bytes bytes to render, may be {@code null}
     * @return two hex digits per byte, or the text {@code "null"} for a {@code null} array
     */
    private static String toHex(byte[] bytes) {
        if (bytes == null) {
            return "null";
        }
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(Character.forDigit((b >> 4) & 0xF, 16));
            hex.append(Character.forDigit(b & 0xF, 16));
        }
        return hex.toString();
    }
}
