package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Locale;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;

/* loaded from: input_file:edu/uci/ics/crawler4j/parser/Parser.class */
/**
 * Converts the raw content of a fetched {@link Page} into parse data and
 * attaches it to the page.
 *
 * <p>Three kinds of content are distinguished by content type: binary content
 * (attached as {@link BinaryParseData} only when the crawl configuration
 * allows it), plain text (attached as {@link TextParseData}) and everything
 * else, which is run through Tika's {@link HtmlParser} and attached as
 * {@link HtmlParseData} together with the outgoing links found in the page.
 */
public class Parser extends Configurable {
    private final HtmlParser htmlParser;
    private final ParseContext parseContext;

    /**
     * Creates a parser bound to the given crawl configuration.
     *
     * @param crawlConfig configuration consulted for the binary-content switch
     *                    and the outgoing-link limit
     */
    public Parser(CrawlConfig crawlConfig) {
        super(crawlConfig);
        this.htmlParser = new HtmlParser();
        this.parseContext = new ParseContext();
    }

    /**
     * Parses the content of {@code page} and stores the result via
     * {@link Page#setParseData}.
     *
     * @param page the fetched page; its content type, content bytes and
     *             content charset are read
     * @param str  URL against which relative links are resolved, unless the
     *             document supplies its own base URL
     * @return {@code true} if parse data was attached to the page;
     *         {@code false} if the page is binary and binary content is
     *         excluded, or if its content could not be decoded
     */
    public boolean parse(Page page, String str) {
        if (Util.hasBinaryContent(page.getContentType())) {
            if (!this.config.isIncludeBinaryContentInCrawling()) {
                return false;
            }
            page.setParseData(BinaryParseData.getInstance());
            return true;
        }
        if (Util.hasPlainTextContent(page.getContentType())) {
            try {
                TextParseData textParseData = new TextParseData();
                textParseData.setTextContent(decodeContent(page));
                page.setParseData(textParseData);
                return true;
            } catch (Exception e) {
                // Best effort: any failure while decoding marks the page as unparsed
                // instead of aborting the caller.
                e.printStackTrace();
                return false;
            }
        }
        return parseHtml(page, str);
    }

    /** Runs the page through the HTML parser and attaches {@link HtmlParseData}. */
    private boolean parseHtml(Page page, String contextUrl) {
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        ByteArrayInputStream input = null;
        try {
            input = new ByteArrayInputStream(page.getContentData());
            this.htmlParser.parse(input, contentHandler, metadata, this.parseContext);
        } catch (Exception e) {
            // A parser failure is reported but not fatal: whatever the handler
            // collected before the failure is still used below.
            e.printStackTrace();
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        // Fall back to the encoding reported in the parser metadata when the
        // page did not already carry one.
        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }

        HtmlParseData htmlParseData = new HtmlParseData();
        htmlParseData.setText(contentHandler.getBodyText().trim());
        htmlParseData.setTitle(metadata.get("title"));

        // A base URL declared by the document takes precedence over the caller's URL.
        String baseUrl = contentHandler.getBaseUrl();
        String resolveAgainst = (baseUrl != null) ? baseUrl : contextUrl;
        htmlParseData.setOutgoingUrls(extractOutgoingUrls(contentHandler, resolveAgainst));

        try {
            htmlParseData.setHtml(decodeContent(page));
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return false;
        }
        page.setParseData(htmlParseData);
        return true;
    }

    /**
     * Canonicalizes the links collected by the handler, keeping at most the
     * configured maximum number of outgoing links.
     */
    private ArrayList<WebURL> extractOutgoingUrls(HtmlContentHandler contentHandler, String contextUrl) {
        ArrayList<WebURL> outgoingUrls = new ArrayList<WebURL>();
        int maxLinks = this.config.getMaxOutgoingLinksToFollow();
        for (ExtractedUrlAnchorPair pair : contentHandler.getOutgoingUrls()) {
            // Check the cap before adding so that no more than maxLinks are kept.
            if (outgoingUrls.size() >= maxLinks) {
                break;
            }
            String href = pair.getHref().trim();
            if (href.length() == 0) {
                continue;
            }
            // Locale-independent lower-casing keeps the "javascript:" check stable
            // regardless of the JVM's default locale. No "http://" prefix needs to
            // be stripped first: it contains neither marker searched for below.
            String lowerHref = href.toLowerCase(Locale.ROOT);
            if (lowerHref.contains("javascript:") || lowerHref.contains("@")) {
                continue;
            }
            String canonicalUrl = URLCanonicalizer.getCanonicalURL(href, contextUrl);
            if (canonicalUrl == null) {
                continue;
            }
            WebURL webUrl = new WebURL();
            webUrl.setURL(canonicalUrl);
            webUrl.setAnchor(pair.getAnchor());
            outgoingUrls.add(webUrl);
        }
        return outgoingUrls;
    }

    /**
     * Decodes the page's content bytes with the page's charset, or with the
     * platform default charset when the page has none.
     */
    private static String decodeContent(Page page) throws UnsupportedEncodingException {
        String charset = page.getContentCharset();
        if (charset == null) {
            return new String(page.getContentData());
        }
        return new String(page.getContentData(), charset);
    }
}
