package org.apache.lenya.search.crawler;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.List;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.Vector;
import org.apache.lenya.util.NamespaceMap;
import org.apache.log4j.Logger;
import websphinx.RobotExclusion;

/* loaded from: input_file:org/apache/lenya/search/crawler/IterativeHTMLCrawler.class */
/**
 * Iterative (queue based, non-recursive) HTML crawler.
 *
 * <p>Starting from one URL, {@code crawl} keeps appending newly discovered, in-scope links to
 * {@link #urlsToCrawl} and walks that list by index until no unvisited entry is left. Accepted
 * documents are copied below {@link #html_dump_directory} and the final URL list is written to
 * {@link #url_list_file}.
 */
public class IterativeHTMLCrawler {
    /** Class logger; assigned in the static initializer at the end of this class. */
    static Logger log;
    /** Accepted {@link URL}s in discovery order; serves both as work queue and as the result list. */
    Vector urlsToCrawl;
    /** Lower-cased string form of every entry of {@link #urlsToCrawl}, used for duplicate detection. */
    TreeSet urlsToCrawlLowerCase;
    /** Path of the file that receives one crawled URL per line. */
    String url_list_file;
    /** Directory prefix below which fetched documents are stored. */
    String html_dump_directory;
    /** Start URL truncated before its path; prefix for links that begin with "/". */
    private String rootURL;
    /** URL prefixes a link has to start with to be accepted (see inScope). */
    private String[] scopeURL;
    /** robots.txt evaluator consulted before a URL is queued. */
    private RobotExclusion robot;
    /** Cache for this class's own Class object, filled by the static initializer (looks compiler generated). */
    static Class class$org$apache$lenya$search$crawler$IterativeHTMLCrawler;

    /**
     * Command-line entry point.
     *
     * <p>Expects exactly one argument, the path of the crawler configuration file. With any other
     * argument count a usage line is printed to standard error and nothing is crawled.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        if (strArr.length != 1) {
            System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
            return;
        }

        String configurationPath = strArr[0];
        try {
            // The configuration is read here for the start/scope URLs and a second time
            // inside the File-based constructor for the crawler settings.
            CrawlerConfiguration configuration = new CrawlerConfiguration(configurationPath);
            IterativeHTMLCrawler crawler = new IterativeHTMLCrawler(new File(configurationPath));
            URL startURL = new URL(configuration.getBaseURL());
            crawler.crawl(startURL, configuration.getScopeURL());
        } catch (MalformedURLException e) {
            log.error("Malformed URL: ", e);
        }
    }

    /**
     * Creates a crawler from explicit settings.
     *
     * @param str  path of the file the crawled URL list is written to
     * @param str2 directory prefix below which fetched documents are dumped
     * @param str3 value handed to the {@code RobotExclusion} constructor
     *             (presumably the user-agent name - verify against RobotExclusion)
     */
    public IterativeHTMLCrawler(String str, String str2, String str3) {
        // Every field is set from an argument, so no default values are assigned first.
        this.url_list_file = str;
        this.html_dump_directory = str2;
        this.robot = new RobotExclusion(str3);
    }

    /**
     * Creates a crawler whose settings are read from a crawler configuration file.
     *
     * @param file the crawler configuration file; its absolute path is passed to
     *             {@code CrawlerConfiguration}
     */
    public IterativeHTMLCrawler(File file) {
        CrawlerConfiguration configuration = new CrawlerConfiguration(file.getAbsolutePath());

        this.url_list_file = configuration.getURIListResolved();
        log.debug("URI list file: " + this.url_list_file);

        this.html_dump_directory = configuration.getHTDocsDumpDirResolved();
        log.debug("HTDocs Dump Dir: " + this.html_dump_directory);

        this.robot = new RobotExclusion(configuration.getUserAgent());

        String robotsFile = configuration.getRobotsFileResolved();
        log.debug("Robots File: " + robotsFile);
        String robotsDomain = configuration.getRobotsDomain();

        // NOTE(review): the robots file and domain are only logged here; nothing in this
        // constructor hands them to the RobotExclusion instance.
        if (robotsFile != null && robotsDomain != null) {
            log.debug(robotsFile + " " + robotsDomain);
        }
    }

    /**
     * Crawls everything reachable from a start URL that lies inside the given scope, dumps the
     * accepted documents and finally writes the list of crawled URLs to {@code url_list_file}.
     *
     * <p>The work queue ({@code urlsToCrawl}) grows while it is being walked: every page is parsed
     * for links, and each link accepted by {@code addURL} is appended and dumped.
     *
     * @param url start URL; a URL without a path (for example {@code http://host}) is crawled
     *            as {@code http://host/}
     * @param str scope prefix; only URLs starting with this string are followed
     */
    public void crawl(URL url, String str) {
        this.scopeURL = new String[] {str};

        String start = url.toString();
        // 8 skips past "https://" so the search finds the first slash of the path.
        int pathStart = start.indexOf("/", 8);
        // A start URL without any path has no such slash; fall back to scheme + authority
        // instead of failing in substring(0, -1).
        this.rootURL = pathStart < 0
                ? url.getProtocol() + "://" + url.getAuthority()
                : start.substring(0, pathStart);

        this.urlsToCrawl = new Vector();
        this.urlsToCrawlLowerCase = new TreeSet();

        try {
            log.info("Start crawling at: " + url);
            String startPath = url.getFile();
            if (startPath.length() == 0) {
                startPath = "/";
            }
            URL startURL = addURL(startPath, parentOf(start));
            if (startURL != null) {
                dumpHTDoc(startURL);
            } else {
                log.warn("Start URL has not been dumped: " + url);
            }
        } catch (MalformedURLException e) {
            log.error("Malformed URL: ", e);
        }

        // size() is re-read on every pass because addURL() appends while we iterate.
        for (int position = 0; position < this.urlsToCrawl.size(); position++) {
            URL current = (URL) this.urlsToCrawl.elementAt(position);
            String currentBase = parentOf(current.toString());
            log.info("INFO: Current Array Size: " + this.urlsToCrawl.size()
                    + ", Current Position: " + position
                    + ", Current URL: " + current.toString());

            List<String> links = parsePage(current.toString());
            if (links == null) {
                continue;
            }
            for (String link : links) {
                try {
                    URL accepted = addURL(link, currentBase);
                    if (accepted != null) {
                        dumpHTDoc(accepted);
                    }
                } catch (MalformedURLException e) {
                    log.warn(e + " " + link);
                }
            }
        }

        // The queue stays empty when the start URL was rejected (out of scope, disallowed,
        // query string, ...); there is no last element to report in that case.
        if (this.urlsToCrawl.isEmpty()) {
            log.warn("No URL has been crawled, start URL was: " + url);
        } else {
            log.info("Stop crawling at: " + this.urlsToCrawl.lastElement());
        }

        writeURLList();
    }

    /**
     * Returns the given URL string cut before its last "/", i.e. the base against which relative
     * links of that document are resolved. A URL whose only slashes belong to "scheme://" (no
     * path at all) is returned unchanged so the host name is never cut off.
     */
    private static String parentOf(String location) {
        int lastSlash = location.lastIndexOf('/');
        int schemeEnd = location.indexOf("://");
        boolean slashBelongsToScheme = schemeEnd >= 0 && lastSlash <= schemeEnd + 2;
        if (lastSlash < 0 || slashBelongsToScheme) {
            return location;
        }
        return location.substring(0, lastSlash);
    }

    /** Writes every queued URL, one per line, to {@code url_list_file}, creating its directory if needed. */
    private void writeURLList() {
        File listFile = new File(this.url_list_file);
        // getParentFile() is null for a bare file name such as "url_file.txt".
        File directory = listFile.getParentFile();
        if (directory != null && !directory.isDirectory()) {
            if (directory.mkdirs()) {
                log.warn("Directory has been created: " + directory);
            } else {
                log.error("Directory could not be created: " + directory);
            }
        }

        PrintWriter writer = null;
        try {
            writer = new PrintWriter(new FileOutputStream(listFile));
            for (int i = 0; i < this.urlsToCrawl.size(); i++) {
                writer.println("" + this.urlsToCrawl.elementAt(i));
            }
            // PrintWriter never throws on write errors; it only records them.
            if (writer.checkError()) {
                log.error("Writing URL list failed: " + listFile);
            }
        } catch (FileNotFoundException e) {
            log.error("File not found: ", e);
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }

    /**
     * Resolves a link and, if it passes all filters, appends it to the crawl queue.
     *
     * <p>A link is rejected (result {@code null}) when {@code parseHREF} cannot turn it into a
     * URL string (anchors, {@code mailto:}, {@code javascript:}, empty links), when
     * {@code filterURL} rejects it (out of scope or already queued), or when robots.txt disallows
     * it. A URL carrying a query string is returned, so the caller may still dump it, but it is
     * not queued for crawling.
     *
     * @param str  the link as found in the page (or the path of the start URL)
     * @param str2 base URL (without trailing slash) that relative links are resolved against
     * @return the resolved URL, or {@code null} if the link must be ignored
     * @throws MalformedURLException if the resolved string is not a valid URL
     */
    public URL addURL(String str, String str2) throws MalformedURLException {
        String resolved = parseHREF(str, str.toLowerCase(), str2);
        if (resolved == null) {
            // Previously the null went into new URL(null) and surfaced as a
            // MalformedURLException; an ignorable link is not an error.
            log.debug("Link is not followed: " + str);
            return null;
        }

        URL url = new URL(resolved);
        if (!filterURL(str, str2, this.urlsToCrawlLowerCase)) {
            return null;
        }
        if (this.robot.disallowed(url)) {
            log.info("Disallowed by robots.txt: " + str);
            return null;
        }

        if (url.getQuery() != null) {
            log.info("Don't crawl URLs with query string: " + url);
            return url;
        }

        this.urlsToCrawl.add(url);
        this.urlsToCrawlLowerCase.add(url.toString().toLowerCase());
        log.debug("URL added: " + url);
        return url;
    }

    /**
     * Fetches a page and returns the links found in it.
     *
     * <p>Only responses with status 200 and a content type containing {@code text/html} are
     * parsed. PDF responses are passed to {@code handlePDF}; everything else is ignored. Any
     * failure is logged at debug level and reported as {@code null}.
     *
     * @param str URL of the page to fetch
     * @return the links delivered by {@code handleHTML}, or {@code null} when the page could not
     *         be fetched, is not HTML, or yields no links to follow
     */
    public List parsePage(String str) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(str).openConnection();
            connection.setRequestProperty("User-Agent", "Lenya Lucene Crawler");
            connection.connect();

            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return null;
            }

            // The header is optional; getContentType() returns null when it is missing.
            String contentType = connection.getContentType();
            if (contentType == null) {
                log.debug("No content type: " + str);
                return null;
            }
            if (contentType.indexOf("text/html") != -1) {
                return handleHTML(connection);
            }
            if (contentType.indexOf("application/pdf") != -1) {
                handlePDF(connection);
            }
            return null;
        } catch (Exception e) {
            // Covers malformed URLs, unknown hosts, I/O errors and non-HTTP connections alike;
            // all of them were handled identically before.
            log.debug("status=" + e);
            return null;
        } finally {
            // Release the connection on every path, including the HTML and the error path.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Parses the HTML body of an open connection and returns its links.
     *
     * @param httpURLConnection connected HTTP connection whose body is HTML
     * @return the links reported by the {@code HTMLHandler}, or {@code null} when the handler's
     *         {@code getRobotFollow()} is false
     * @throws IOException if the response body cannot be opened or read
     */
    public static List handleHTML(HttpURLConnection httpURLConnection) throws IOException {
        InputStream body = httpURLConnection.getInputStream();
        try {
            HTMLHandler handler = new HTMLHandler();
            // Assumes parse() consumes the stream before returning, which the immediate
            // getLinks() call below relies on as well.
            handler.parse(body);
            return handler.getRobotFollow() ? handler.getLinks() : null;
        } finally {
            try {
                body.close();
            } catch (IOException ignored) {
                // A failing close must not discard links that were already extracted.
                log.debug("Closing response stream failed: " + ignored);
            }
        }
    }

    /**
     * Placeholder for PDF responses: nothing is read from the connection, only a debug message
     * is logged.
     *
     * @param httpURLConnection connection delivering the PDF (currently unused)
     */
    public void handlePDF(HttpURLConnection httpURLConnection) {
        final String notice = ".handlePDF(): Not handled yet!";
        log.debug(notice);
    }

    /**
     * Decides whether a link is new and inside the crawl scope.
     *
     * <p>Links that do not start with {@code http://} or {@code https://} are first resolved
     * with {@code parseHREF}. A link that cannot be resolved or is out of scope is logged at
     * debug level and rejected.
     *
     * @param str     the link as found in the page
     * @param str2    base URL that relative links are resolved against
     * @param treeSet lower-cased URLs that are already known
     * @return {@code true} if the link is in scope and not yet contained in {@code treeSet}
     */
    public boolean filterURL(String str, String str2, TreeSet treeSet) {
        String candidate = str;
        String candidateLowerCase = str.toLowerCase();

        boolean absolute = candidateLowerCase.startsWith("http://")
                || candidateLowerCase.startsWith("https://");
        if (!absolute) {
            candidate = parseHREF(str, candidateLowerCase, str2);
            if (candidate != null) {
                candidateLowerCase = candidate.toLowerCase();
            }
        }

        if (candidate == null || !inScope(candidate)) {
            log.debug("Not in scope: " + candidate);
            return false;
        }
        return !treeSet.contains(candidateLowerCase);
    }

    /**
     * Turns a link found in a page into an absolute URL string.
     *
     * <ul>
     *   <li>{@code http://} and {@code https://} links are returned unchanged.</li>
     *   <li>Links starting with "/" are appended to the root URL of the crawl.</li>
     *   <li>"./x", "../x" and plain relative links are resolved against the base URL.</li>
     *   <li>{@code javascript:}, {@code mailto:}, pure anchors ("#...") and empty links yield
     *       {@code null}.</li>
     * </ul>
     * A fragment ("#...") is stripped from every resolved (non-absolute) link.
     *
     * @param str  the link exactly as found in the page
     * @param str2 lower-cased copy of {@code str}, used for the prefix tests
     * @param str3 base URL of the containing page, without trailing slash
     * @return the absolute URL string, or {@code null} if the link is not to be followed
     */
    public String parseHREF(String str, String str2, String str3) {
        if (str2.startsWith("http://") || str2.startsWith("https://")) {
            return str;
        }

        String resolved;
        if (str2.startsWith("/")) {
            resolved = this.rootURL + str;
        } else if (str2.startsWith("./")) {
            // Drop the leading dot only; the remaining "/x" attaches directly to the base.
            resolved = str3 + str.substring(1);
        } else if (str2.startsWith("../")) {
            resolved = resolveParentReference(str, str2, str3);
        } else if (str2.startsWith("javascript:")) {
            log.debug("\"javascript:\" is not implemented yet!");
            resolved = null;
        } else if (str2.startsWith("#")) {
            log.debug("\"#\" (anchor) will be ignored!");
            resolved = null;
        } else if (str2.startsWith("mailto:")) {
            log.debug("\"mailto:\" is not a URL to be followed!");
            resolved = null;
        } else if (str2.equals("")) {
            resolved = null;
        } else {
            resolved = str3 + "/" + str;
        }

        if (resolved != null) {
            int fragment = resolved.indexOf("#");
            if (fragment != -1) {
                resolved = resolved.substring(0, fragment);
            }
        }
        return resolved;
    }

    /**
     * Resolves a link that starts with one or more "../" segments against the base URL.
     *
     * <p>Only the contiguous leading "../" segments are counted; a "../" further inside the link
     * (as in "../a/../b") is left in the result untouched. Climbing stops at the host root, so
     * the scheme and host of the base are never cut off.
     *
     * @return the resolved URL string, or {@code null} (after logging an error) if the text
     *         after the leading "../" segments still starts with a dot
     */
    private String resolveParentReference(String href, String hrefLowerCase, String base) {
        int levels = 0;
        while (hrefLowerCase.startsWith("../", levels * 3)) {
            levels++;
        }

        String remainder = href.substring(levels * 3);
        if (remainder.length() > 0 && remainder.charAt(0) == '.') {
            log.error("Parsing failed: " + href + " (" + base + ")");
            return null;
        }

        // Index of the slash that ends "scheme://host[:port]"; nothing before it may be removed.
        int schemeEnd = base.indexOf("://");
        int authorityStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
        int rootEnd = base.indexOf('/', authorityStart);
        if (rootEnd < 0) {
            rootEnd = base.length();
        }

        // 'cut' is the exclusive end of the directory that remains after climbing.
        int cut = base.length();
        for (int level = 0; level < levels && cut > rootEnd; level++) {
            int slash = base.lastIndexOf('/', cut - 1);
            cut = slash < rootEnd ? rootEnd : slash;
        }
        return base.substring(0, cut) + "/" + remainder;
    }

    /**
     * Tells whether a URL lies inside the crawl scope.
     *
     * @param str absolute URL string to test
     * @return {@code true} if {@code str} starts with at least one of the scope prefixes
     */
    public boolean inScope(String str) {
        for (String scopePrefix : this.scopeURL) {
            if (str.startsWith(scopePrefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Currently a no-op: hands back the given URL unchanged and ignores the second argument.
     *
     * @param url the URL to return
     * @param str unused
     * @return {@code url}, the very same instance
     * @throws MalformedURLException never thrown by the current implementation
     */
    public URL completeURL(URL url, String str) throws MalformedURLException {
        final URL unchanged = url;
        return unchanged;
    }

    /**
     * Downloads a document and stores it below the dump directory.
     *
     * <p>The target path is the dump directory followed by the file part of the URL; a path
     * ending in "/" is stored as {@code index.html}. Only documents with the extension
     * {@code html}, {@code htm}, {@code txt} or {@code pdf} are stored. URLs whose path would
     * resolve to a location outside the dump directory (for example through ".." segments)
     * are refused. Failures are logged and never propagated.
     *
     * @param url the document to download
     */
    public void dumpHTDoc(URL url) {
        String extension = getExtension(url);
        String target = this.html_dump_directory + url.getFile();
        File file = new File(target);
        if (target.endsWith("/")) {
            file = new File(target + "index.html");
            extension = getExtension(file);
        }

        boolean dumpable = "html".equals(extension) || "htm".equals(extension)
                || "txt".equals(extension) || "pdf".equals(extension);
        if (!dumpable) {
            log.info("URL not dumped: " + url);
            return;
        }

        HttpURLConnection connection = null;
        InputStream in = null;
        FileOutputStream out = null;
        try {
            // The URL comes from crawled pages and is untrusted: make sure ".." segments
            // cannot move the target file out of the dump directory.
            String dumpRoot = new File(this.html_dump_directory).getCanonicalPath();
            if (!dumpRoot.endsWith(File.separator)) {
                dumpRoot = dumpRoot + File.separator;
            }
            if (!file.getCanonicalPath().startsWith(dumpRoot)) {
                log.error("URL not dumped (target outside of dump directory): " + url);
                return;
            }

            File directory = file.getParentFile();
            if (directory != null && !directory.exists()) {
                directory.mkdirs();
            }

            connection = (HttpURLConnection) url.openConnection();
            in = connection.getInputStream();
            out = new FileOutputStream(file);
            byte[] buffer = new byte[1024];
            int read;
            while ((read = in.read(buffer)) >= 0) {
                out.write(buffer, 0, read);
            }
            // Close explicitly before reporting success so a failing close is still an error.
            out.close();
            log.info("URL dumped: " + url + " (" + file + ")");
        } catch (Exception e) {
            log.error("" + e);
            log.error("URL not dumped: " + url);
        } finally {
            // Best-effort cleanup; closing an already closed stream is harmless.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException ignored) {
                    log.debug("Closing dump file failed: " + ignored);
                }
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    log.debug("Closing response stream failed: " + ignored);
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Determines the extension of the path part of a URL (query string and fragment are not
     * part of {@code URL.getPath()} and therefore ignored).
     *
     * @param url the URL to inspect
     * @return whatever {@code getExtension(File)} returns for the URL's path
     */
    public String getExtension(URL url) {
        File pathAsFile = new File(url.getPath());
        return getExtension(pathAsFile);
    }

    /**
     * Returns the last token of the file's path when split at {@code NamespaceMap.SEPARATOR}
     * (presumably "." - verify against NamespaceMap).
     *
     * <p>Note that a path without any separator yields the whole path, not {@code null}.
     *
     * @param file the file whose path is inspected
     * @return the last token, or {@code null} if the path contains no token at all
     */
    public String getExtension(File file) {
        StringTokenizer tokens = new StringTokenizer(file.getPath(), NamespaceMap.SEPARATOR);
        String lastToken = null;
        while (tokens.hasMoreTokens()) {
            lastToken = tokens.nextToken();
        }
        return lastToken;
    }

    /**
     * Looks up a class by name, converting the checked {@code ClassNotFoundException} into a
     * {@code NoClassDefFoundError} that carries the original exception as its cause.
     *
     * @param str fully qualified class name
     * @return the class object for {@code str}
     * @throws NoClassDefFoundError if no class of that name can be loaded
     */
    static Class class$(String str) {
        try {
            return Class.forName(str);
        } catch (ClassNotFoundException e) {
            // initCause() is declared to return Throwable, so throwing its result directly
            // does not compile in a method without a throws clause; keep the typed reference.
            NoClassDefFoundError error = new NoClassDefFoundError();
            error.initCause(e);
            throw error;
        }
    }

    // Resolves this class once, caches it in the class$... field and creates the logger for it.
    static {
        if (class$org$apache$lenya$search$crawler$IterativeHTMLCrawler == null) {
            class$org$apache$lenya$search$crawler$IterativeHTMLCrawler =
                    class$("org.apache.lenya.search.crawler.IterativeHTMLCrawler");
        }
        log = Logger.getLogger(class$org$apache$lenya$search$crawler$IterativeHTMLCrawler);
    }
}
