package org.apache.nutch.parse.feed;

import com.sun.syndication.feed.synd.SyndCategory;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.feed.synd.SyndPerson;
import com.sun.syndication.io.SyndFeedInput;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.jempbox.xmp.ResourceEvent;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;

/* loaded from: input_file:plugins/feed/feed.jar:org/apache/nutch/parse/feed/FeedParser.class */
public class FeedParser implements Parser {
    public static final String CHARSET_UTF8 = "charset=UTF-8";
    public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; charset=UTF-8";
    public static final Logger LOG = LoggerFactory.getLogger(FeedParser.class);
    private Configuration conf;
    private ParserFactory parserFactory;
    private URLNormalizers normalizers;
    private URLFilters filters;
    private String defaultEncoding;

    public ParseResult getParse(Content content) {
        String str;
        ParseResult parseResult = new ParseResult(content.getUrl());
        EncodingDetector encodingDetector = new EncodingDetector(this.conf);
        encodingDetector.autoDetectClues(content, true);
        String guessEncoding = encodingDetector.guessEncoding(content, this.defaultEncoding);
        try {
            InputSource inputSource = new InputSource(new ByteArrayInputStream(content.getContent()));
            inputSource.setEncoding(guessEncoding);
            SyndFeed build = new SyndFeedInput().build(inputSource);
            List entries = build.getEntries();
            try {
                str = this.normalizers.normalize(build.getLink(), "outlink");
                if (str != null) {
                    str = this.filters.filter(str);
                }
            } catch (Exception e) {
                str = null;
            }
            Iterator it = entries.iterator();
            while (it.hasNext()) {
                addToMap(parseResult, build, str, (SyndEntry) it.next(), content);
            }
            parseResult.put(content.getUrl(), new ParseText(stripTags(build.getDescriptionEx())), new ParseData(new ParseStatus(1), stripTags(build.getTitleEx()), new Outlink[0], content.getMetadata()));
            return parseResult;
        } catch (Exception e2) {
            LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: " + StringUtils.stringifyException(e2));
            return new ParseStatus(e2).getEmptyParseResult(content.getUrl(), getConf());
        }
    }

    public void setConf(Configuration configuration) {
        this.conf = configuration;
        this.parserFactory = new ParserFactory(configuration);
        this.normalizers = new URLNormalizers(configuration, "outlink");
        this.filters = new URLFilters(configuration);
        this.defaultEncoding = configuration.get("parser.character.encoding.default", "windows-1252");
    }

    public Configuration getConf() {
        return this.conf;
    }

    public static void main(String[] strArr) throws Exception {
        if (strArr.length != 1) {
            System.err.println("Usage: FeedParser <feed>");
            System.exit(1);
        }
        String str = strArr[0];
        String str2 = "file:" + str;
        Configuration create = NutchConfiguration.create();
        FeedParser feedParser = new FeedParser();
        feedParser.setConf(create);
        File file = new File(str);
        byte[] bArr = new byte[(int) file.length()];
        new DataInputStream(new FileInputStream(file)).readFully(bArr);
        Iterator it = feedParser.getParse(new Content(str2, str2, bArr, "application/rss+xml", new Metadata(), create)).iterator();
        while (it.hasNext()) {
            Map.Entry entry = (Map.Entry) it.next();
            System.out.println("key: " + entry.getKey());
            Parse parse = (Parse) entry.getValue();
            System.out.println("data: " + parse.getData());
            System.out.println("text: " + parse.getText() + "\n");
        }
    }

    private void addToMap(ParseResult parseResult, SyndFeed syndFeed, String str, SyndEntry syndEntry, Content content) {
        String link = syndEntry.getLink();
        String str2 = null;
        Metadata metadata = new Metadata();
        Metadata metadata2 = content.getMetadata();
        Parse parse = null;
        SyndContent description = syndEntry.getDescription();
        try {
            String normalize = this.normalizers.normalize(link, "outlink");
            if (normalize != null) {
                normalize = this.filters.filter(normalize);
            }
            if (normalize == null) {
                return;
            }
            String stripTags = stripTags(syndEntry.getTitleEx());
            if (str != null) {
                metadata.set("feed", str);
            }
            addFields(metadata, metadata2, syndFeed, syndEntry);
            String str3 = metadata2.get("Content-Type");
            if (description != null) {
                str2 = description.getValue();
            }
            if (str2 == null) {
                List contents = syndEntry.getContents();
                StringBuilder sb = new StringBuilder();
                Iterator it = contents.iterator();
                while (it.hasNext()) {
                    sb.append(((SyndContent) it.next()).getValue());
                }
                str2 = sb.toString();
            }
            try {
                parse = this.parserFactory.getParsers(str3, normalize)[0].getParse(new Content(normalize, normalize, str2.getBytes(), str3, metadata2, this.conf)).get(normalize);
            } catch (ParserNotFound e) {
            }
            if (parse == null) {
                metadata2.remove("Content-Type");
                parseResult.put(normalize, new ParseText(str2), new ParseData(ParseStatus.STATUS_FAILURE, stripTags, new Outlink[0], metadata2, metadata));
            } else {
                ParseData data = parse.getData();
                data.getContentMeta().remove("Content-Type");
                mergeMetadata(data.getParseMeta(), metadata);
                parseResult.put(normalize, new ParseText(parse.getText()), new ParseData(ParseStatus.STATUS_SUCCESS, stripTags, data.getOutlinks(), data.getContentMeta(), data.getParseMeta()));
            }
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    private static String stripTags(SyndContent syndContent) {
        if (syndContent == null) {
            return "";
        }
        String[] split = syndContent.getValue().split("<[^>]*>");
        StringBuffer stringBuffer = new StringBuffer();
        for (String str : split) {
            stringBuffer.append(str);
        }
        return stringBuffer.toString().trim();
    }

    private void addFields(Metadata metadata, Metadata metadata2, SyndFeed syndFeed, SyndEntry syndEntry) {
        List authors = syndEntry.getAuthors();
        List categories = syndEntry.getCategories();
        Date publishedDate = syndEntry.getPublishedDate();
        Date updatedDate = syndEntry.getUpdatedDate();
        String str = null;
        if (authors != null) {
            Iterator it = authors.iterator();
            while (it.hasNext()) {
                String name = ((SyndPerson) it.next()).getName();
                if (checkString(name)) {
                    metadata.add("author", name);
                }
            }
        } else {
            String author = syndEntry.getAuthor();
            if (checkString(author)) {
                metadata.set("author", author);
            }
        }
        Iterator it2 = categories.iterator();
        while (it2.hasNext()) {
            metadata.add("tag", ((SyndCategory) it2.next()).getName());
        }
        if (publishedDate != null) {
            metadata.set(ResourceEvent.ACTION_PUBLISHED, Long.toString(publishedDate.getTime()));
        }
        if (updatedDate != null) {
            metadata.set("updated", Long.toString(updatedDate.getTime()));
        }
        SyndContent description = syndEntry.getDescription();
        if (description != null) {
            str = description.getType();
        } else {
            List contents = syndEntry.getContents();
            if (contents.size() > 0) {
                str = ((SyndContent) contents.get(0)).getType();
            }
        }
        if (!checkString(str)) {
            metadata2.set("Content-Type", TEXT_PLAIN_CONTENT_TYPE);
            return;
        }
        if (str.equals("html")) {
            str = "text/html";
        } else if (str.equals(com.sun.syndication.feed.atom.Content.XHTML)) {
            str = "text/xhtml";
        }
        metadata2.set("Content-Type", str + "; " + CHARSET_UTF8);
    }

    private void mergeMetadata(Metadata metadata, Metadata metadata2) {
        for (String str : metadata2.names()) {
            for (String str2 : metadata2.getValues(str)) {
                metadata.add(str, str2);
            }
        }
    }

    private boolean checkString(String str) {
        return (str == null || str.equals("")) ? false : true;
    }
}
