package org.apache.nutch.tools.arc;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/tools/arc/ArcSegmentCreator.class */
/**
 * Hadoop {@link Tool} and old-API {@link Mapper} that turns ARC records into a
 * Nutch segment.
 *
 * <p>{@link #createSegments(Path, Path)} configures a job that reads its input
 * through {@code ArcInputFormat}, runs this class as the mapper and writes the
 * result through {@code FetcherOutputFormat} into a directory named by
 * {@link #generateSegmentName()}. For every record the mapper normalizes and
 * filters the URL, wraps the record bytes in a {@link Content}, parses it and
 * emits the crawl datum, the content and the resulting parses.
 */
public class ArcSegmentCreator extends Configured implements Tool, Mapper<Text, BytesWritable, Text, NutchWritable> {
    /** Content metadata key under which the third ARC header field is stored. */
    public static final String URL_VERSION = "arc.url.version";

    public static final Logger LOG = LoggerFactory.getLogger(ArcSegmentCreator.class);

    /**
     * Formatter for segment names. {@link SimpleDateFormat} is not thread-safe;
     * this instance is only ever used inside the synchronized
     * {@link #generateSegmentName()}.
     */
    private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

    /** Minimum number of whitespace-separated header fields {@link #map} reads (indices 0, 2 and 3). */
    private static final int MIN_HEADER_FIELDS = 4;

    // NOTE(review): the three literals below were inlined by the decompiler; they presumably
    // correspond to CrawlDatum.STATUS_DB_FETCHED, STATUS_FETCH_SUCCESS and STATUS_FETCH_RETRY
    // respectively -- confirm against CrawlDatum before replacing them with the named constants.
    /** Status the crawl datum is created with. */
    private static final int INITIAL_DATUM_STATUS = 2;
    /** Status recorded when a record was converted without throwing. */
    private static final int STATUS_ON_SUCCESS = 33;
    /** Status recorded when converting a record threw. */
    private static final int STATUS_ON_FAILURE = 34;

    private JobConf jobConf;
    private URLFilters urlFilters;
    private ScoringFilters scfilters;
    private ParseUtil parseUtil;
    private URLNormalizers normalizers;
    private int interval;

    /** Creates an instance without a configuration; one must be supplied before use. */
    public ArcSegmentCreator() {
    }

    /**
     * Creates an instance bound to the given configuration.
     *
     * @param configuration configuration handed to {@link #setConf(Configuration)}
     */
    public ArcSegmentCreator(Configuration configuration) {
        setConf(configuration);
    }

    /**
     * Generates a segment name of the form {@code yyyyMMddHHmmss} from the current time.
     *
     * <p>The method is synchronized and sleeps for one second before reading the clock.
     * Because the name has one-second resolution, this keeps successive calls within
     * the same JVM from returning the same name. If the sleep is interrupted the
     * interrupt status is restored and a name is still returned (in that case the
     * one-second spacing is not guaranteed).
     *
     * @return the generated segment name
     */
    public static synchronized String generateSegmentName() {
        try {
            Thread.sleep(1000L);
        } catch (InterruptedException interrupted) {
            // Do not swallow the interrupt: let callers further up observe it.
            Thread.currentThread().interrupt();
        }
        return sdf.format(new Date(System.currentTimeMillis()));
    }

    /**
     * Initializes the URL filters, scoring filters, parse utility, URL normalizers
     * and the default fetch interval from the job configuration.
     *
     * @param jobConf the job configuration
     */
    @Override // org.apache.hadoop.mapred.JobConfigurable
    public void configure(JobConf jobConf) {
        this.jobConf = jobConf;
        this.urlFilters = new URLFilters(this.jobConf);
        this.scfilters = new ScoringFilters(this.jobConf);
        this.parseUtil = new ParseUtil(this.jobConf);
        this.normalizers = new URLNormalizers(this.jobConf, URLNormalizers.SCOPE_FETCHER);
        // Fallback is 2592000 (30 days if the unit is seconds -- TODO confirm the unit).
        this.interval = this.jobConf.getInt("db.fetch.interval.default", 2592000);
    }

    /** Nothing to release. */
    @Override // java.io.Closeable, java.lang.AutoCloseable
    public void close() {
    }

    /**
     * Updates the crawl datum and, when content is available, parses it and emits
     * the datum, the content and every parse to the collector.
     *
     * <p>When {@code content} is {@code null} only the datum's status, fetch time and
     * (if given) protocol status are updated; nothing is emitted.
     *
     * @param outputCollector collector receiving the emitted records
     * @param segmentName     segment name written into the content and parse metadata
     * @param url             key under which the datum and content are emitted
     * @param crawlDatum      datum to update and emit
     * @param content         record content, or {@code null} if none could be built
     * @param protocolStatus  protocol status to attach to the datum, may be {@code null}
     * @param status          status code to set on the datum
     * @return the parse status of the parse stored under the content's URL, or
     *         {@code null} if there is no content, no parse result, or no such parse
     */
    private ParseStatus output(OutputCollector<Text, NutchWritable> outputCollector, String segmentName, Text url, CrawlDatum crawlDatum, Content content, ProtocolStatus protocolStatus, int status) {
        crawlDatum.setStatus(status);
        crawlDatum.setFetchTime(System.currentTimeMillis());
        if (protocolStatus != null) {
            crawlDatum.getMetaData().put((Writable) Nutch.WRITABLE_PROTO_STATUS_KEY, (Writable) protocolStatus);
        }
        if (content == null) {
            return null;
        }

        content.getMetadata().set(Nutch.SEGMENT_NAME_KEY, segmentName);

        // Scoring is best effort: a failing scoring filter must not prevent the record from being emitted.
        try {
            this.scfilters.passScoreBeforeParsing(url, crawlDatum, content);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + url + " (" + e + ")");
            }
        }

        ParseResult parseResult = null;
        try {
            parseResult = this.parseUtil.parse(content);
        } catch (Exception e) {
            LOG.warn("Error parsing: " + url + ": " + StringUtils.stringifyException(e));
        }

        if (parseResult == null) {
            // No parse available: sign the content against an empty parse instead.
            Parse emptyParse = new ParseStatus().getEmptyParse(getConf());
            crawlDatum.setSignature(SignatureFactory.getSignature(getConf()).calculate(content, emptyParse));
        }

        try {
            outputCollector.collect(url, new NutchWritable(crawlDatum));
            outputCollector.collect(url, new NutchWritable(content));
            if (parseResult != null) {
                Iterator<Map.Entry<Text, Parse>> entries = parseResult.iterator();
                while (entries.hasNext()) {
                    Map.Entry<Text, Parse> entry = entries.next();
                    Text parsedUrl = entry.getKey();
                    Parse parse = entry.getValue();
                    ParseStatus parseStatus = parse.getData().getStatus();
                    if (!parseStatus.isSuccess()) {
                        LOG.warn("Error parsing: " + url + ": " + parseStatus);
                        parse = parseStatus.getEmptyParse(getConf());
                    }

                    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
                    Metadata contentMeta = parse.getData().getContentMeta();
                    contentMeta.set(Nutch.SEGMENT_NAME_KEY, segmentName);
                    contentMeta.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                    contentMeta.set(Nutch.FETCH_TIME_KEY, Long.toString(crawlDatum.getFetchTime()));
                    if (parsedUrl.equals(url)) {
                        crawlDatum.setSignature(signature);
                    }

                    try {
                        this.scfilters.passScoreAfterParsing(parsedUrl, content, parse);
                    } catch (Exception e) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn("Couldn't pass score, url " + url + " (" + e + ")");
                        }
                    }

                    outputCollector.collect(parsedUrl, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse.isCanonical())));
                }
            }
        } catch (IOException e) {
            if (LOG.isErrorEnabled()) {
                LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
            }
        }

        if (parseResult == null || parseResult.isEmpty()) {
            return null;
        }
        Parse primaryParse = parseResult.get(content.getUrl());
        return primaryParse == null ? null : primaryParse.getData().getStatus();
    }

    /**
     * Logs, at info level, that converting the given URL failed.
     *
     * @param text URL of the record that failed
     * @param th   the failure
     */
    private void logError(Text text, Throwable th) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Conversion of " + text + " failed with: " + StringUtils.stringifyException(th));
        }
    }

    /**
     * Converts one ARC record into segment output.
     *
     * <p>The key is split on whitespace; field 0 is used as the URL, field 2 is stored
     * under {@link #URL_VERSION} and field 3 is used as the content type. Keys with
     * fewer than four fields are logged and skipped, as are {@code filedesc://}
     * headers and URLs rejected by the normalizers/filters.
     *
     * @param text            record header line
     * @param bytesWritable   record body; only the first {@code getLength()} bytes are used
     * @param outputCollector collector receiving the emitted records
     * @param reporter        progress reporter
     * @throws IOException declared by the {@link Mapper} contract
     */
    @Override // org.apache.hadoop.mapred.Mapper
    public void map(Text text, BytesWritable bytesWritable, OutputCollector<Text, NutchWritable> outputCollector, Reporter reporter) throws IOException {
        String[] headerFields = text.toString().split("\\s+");
        if (headerFields.length < MIN_HEADER_FIELDS) {
            // A short header would otherwise fail the whole task with ArrayIndexOutOfBoundsException.
            LOG.warn("Skipping malformed ARC record header: " + text);
            return;
        }
        String rawUrl = headerFields[0];
        String versionField = headerFields[2];
        String contentType = headerFields[3];

        if (rawUrl.startsWith("filedesc://")) {
            LOG.info("Ignoring file header: " + rawUrl);
            return;
        }
        LOG.info("Processing: " + rawUrl);

        CrawlDatum crawlDatum = new CrawlDatum(INITIAL_DATUM_STATUS, this.interval, 1.0f);
        String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

        String acceptedUrl;
        try {
            acceptedUrl = this.urlFilters.filter(this.normalizers.normalize(rawUrl, URLNormalizers.SCOPE_FETCHER));
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                // Log the raw URL: the Text key has not been populated yet at this point.
                LOG.warn("Skipping " + rawUrl + ":" + e);
            }
            acceptedUrl = null;
        }
        if (acceptedUrl == null) {
            return;
        }

        Text urlKey = new Text(acceptedUrl);
        try {
            // getBytes() exposes the backing buffer, which is only valid up to getLength();
            // copy exactly the valid region so no stale/padding bytes end up in the content.
            byte[] payload = new byte[bytesWritable.getLength()];
            System.arraycopy(bytesWritable.getBytes(), 0, payload, 0, payload.length);

            Content content = new Content(acceptedUrl, acceptedUrl, payload, contentType, new Metadata(), getConf());
            content.getMetadata().set(URL_VERSION, versionField);
            output(outputCollector, segmentName, urlKey, crawlDatum, content, ProtocolStatus.STATUS_SUCCESS, STATUS_ON_SUCCESS);
            reporter.progress();
        } catch (Throwable th) {
            // Deliberately broad: one bad record must not fail the task. With null content
            // output() only updates the datum and emits nothing.
            logError(urlKey, th);
            output(outputCollector, segmentName, urlKey, crawlDatum, null, null, STATUS_ON_FAILURE);
        }
    }

    /**
     * Runs the conversion job over the given ARC input and writes a new segment
     * directory (named by {@link #generateSegmentName()}) below the output path.
     *
     * @param path  location of the ARC files
     * @param path2 parent directory in which the segment directory is created
     * @throws IOException if the job fails
     */
    public void createSegments(Path path, Path path2) throws IOException {
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long startMillis = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ArcSegmentCreator: starting at " + timestampFormat.format(Long.valueOf(startMillis)));
            LOG.info("ArcSegmentCreator: arc files dir: " + path);
        }

        NutchJob job = new NutchJob(getConf());
        job.setJobName("ArcSegmentCreator " + path);

        // The segment name is also published in the job configuration so the mapper can read it back.
        String segmentName = generateSegmentName();
        job.set(Nutch.SEGMENT_NAME_KEY, segmentName);

        FileInputFormat.addInputPath(job, path);
        job.setInputFormat(ArcInputFormat.class);
        job.setMapperClass(ArcSegmentCreator.class);
        FileOutputFormat.setOutputPath(job, new Path(path2, segmentName));
        job.setOutputFormat(FetcherOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NutchWritable.class);

        JobClient.runJob(job);

        long endMillis = System.currentTimeMillis();
        LOG.info("ArcSegmentCreator: finished at " + timestampFormat.format(Long.valueOf(endMillis)) + ", elapsed: " + TimingUtil.elapsedTime(startMillis, endMillis));
    }

    /**
     * Command-line entry point; exits the JVM with the code returned by {@link #run(String[])}.
     *
     * @param strArr command-line arguments
     * @throws Exception if the tool runner fails
     */
    public static void main(String[] strArr) throws Exception {
        System.exit(ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), strArr));
    }

    /**
     * Validates the arguments and runs {@link #createSegments(Path, Path)}.
     *
     * @param strArr {@code <arcFiles> <segmentsOutDir>}
     * @return 0 on success, -1 on bad usage or when segment creation throws
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        if (strArr.length < 2) {
            System.err.println("Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>");
            return -1;
        }
        try {
            createSegments(new Path(strArr[0]), new Path(strArr[1]));
            return 0;
        } catch (Exception e) {
            LOG.error("ArcSegmentCreator: " + StringUtils.stringifyException(e));
            return -1;
        }
    }
}
