package org.apache.nutch.parse.zip;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.MimeUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:plugins/parse-zip/parse-zip.jar:org/apache/nutch/parse/zip/ZipTextExtractor.class */
/**
 * Extracts text and outlinks from the entries of a zip archive.
 *
 * <p>Each non-directory entry whose name contains a {@code '.'} is wrapped in a
 * {@link Content} object and handed to {@link ParseUtil}; the text of every
 * successfully parsed entry is concatenated into a single string.
 */
public class ZipTextExtractor {
    public static final Logger LOG = LoggerFactory.getLogger(ZipTextExtractor.class);

    /** Chunk size used when draining a zip entry into memory. */
    private static final int BUFFER_SIZE = 8192;

    private final MimeUtil mimeUtil;
    private final Configuration conf;

    /**
     * Creates an extractor that uses the given configuration for MIME type
     * detection and for parsing the individual entries.
     *
     * @param configuration configuration passed to {@link MimeUtil},
     *                      {@link Content} and {@link ParseUtil}
     */
    public ZipTextExtractor(Configuration configuration) {
        this.conf = configuration;
        this.mimeUtil = new MimeUtil(configuration);
    }

    /**
     * Parses every entry of the zip archive read from {@code inputStream} and
     * returns the concatenated text.
     *
     * <p>For each parsed entry the result contains the entry name, a space, the
     * parsed text and another space. Directory entries and entries whose name
     * has no {@code '.'} are skipped. Entries that fail with a
     * {@link ParseException} are logged and skipped.
     *
     * <p>The stream is not closed by this method; the caller keeps ownership
     * of it.
     *
     * @param inputStream stream positioned at the start of the zip data
     * @param str         URL of the archive; each entry is addressed as
     *                    {@code str + "/" + entryName}
     * @param list        receives a copy of every outlink found in the parsed
     *                    entries
     * @return the concatenated text of all parsed entries, or an empty string
     *         if nothing was parsed
     * @throws IOException if the archive cannot be read, or if an entry URL or
     *                     outlink URL is malformed
     *                     ({@link java.net.MalformedURLException})
     */
    public String extractText(InputStream inputStream, String str, List<? super Outlink> list)
            throws IOException {
        StringBuilder text = new StringBuilder();
        // Deliberately not wrapped in try-with-resources: closing the
        // ZipInputStream would also close the caller's stream.
        ZipInputStream zipInputStream = new ZipInputStream(inputStream);
        // Created on first use and then shared by all entries of this archive.
        // NOTE(review): assumes ParseUtil can be reused across parse() calls - confirm.
        ParseUtil parseUtil = null;

        ZipEntry entry;
        while ((entry = zipInputStream.getNextEntry()) != null) {
            if (entry.isDirectory()) {
                continue;
            }

            String name = entry.getName();
            String entryUrl = str + "/" + name;
            // Built before the extension check so that a malformed URL is
            // reported for every file entry, not only for parsable ones.
            String baseUrl = new URL(entryUrl).toString();
            if (name.lastIndexOf('.') == -1) {
                // No extension: the entry is not parsed, so do not buffer it.
                continue;
            }

            // ZipEntry.getSize() is -1 when the size is not known up front, so
            // the payload is read until end-of-entry instead of pre-sizing an
            // array from the header value.
            byte[] data = readEntry(zipInputStream);
            String mimeType = mimeUtil.getMimeType(name);
            try {
                Metadata metadata = new Metadata();
                metadata.set("Content-Length", Long.toString(data.length));
                metadata.set("Content-Type", mimeType);
                Content content = new Content(entryUrl, baseUrl, data, mimeType, metadata, conf);

                if (parseUtil == null) {
                    parseUtil = new ParseUtil(conf);
                }
                Parse parse = parseUtil.parse(content).get(content.getUrl());
                if (parse == null) {
                    // Guard against a result that has no parse for this URL.
                    LOG.warn("no parse result for {}", name);
                    continue;
                }

                for (Outlink outlink : parse.getData().getOutlinks()) {
                    list.add(new Outlink(outlink.getToUrl(), outlink.getAnchor()));
                }
                text.append(name).append(' ').append(parse.getText()).append(' ');
            } catch (ParseException e) {
                LOG.info("fetch okay, but can't parse {}, reason: {}", name, e.getMessage());
            }
        }
        return text.toString();
    }

    /**
     * Reads the remaining bytes of the current zip entry into memory.
     *
     * <p>NOTE(review): the whole entry is buffered with no size cap; consider
     * a limit if archives come from untrusted sources.
     */
    private static byte[] readEntry(ZipInputStream zipInputStream) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[BUFFER_SIZE];
        int read;
        while ((read = zipInputStream.read(buffer, 0, buffer.length)) != -1) {
            out.write(buffer, 0, read);
        }
        return out.toByteArray();
    }
}
