§How to write a basic Web Crawler

§Installation

  1. Install IntelliJ IDEA or Eclipse

  2. Crawler4j: download the latest crawler4j-x.x-jar-with-dependencies.jar

  3. Create a new project & add the external JAR

    Project Structure (Ctrl+Shift+Alt+S on Windows/Linux, ⌘+; on Mac OS X) -> Libraries, click +

  4. Write the classes from the crawler4j Quickstart

    Controller, which holds the main function:

    public class Controller {
        public static void main(String[] args) throws Exception {
            String crawlStorageFolder = "/data/crawl";
            int numberOfCrawlers = 7;

            CrawlConfig config = new CrawlConfig();
            config.setCrawlStorageFolder(crawlStorageFolder);

            /*
             * Instantiate the controller for this crawl.
             */
            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

            /*
             * For each crawl, you need to add some seed URLs. These are the first
             * URLs that are fetched; the crawler then starts following the links
             * found in those pages.
             */
            controller.addSeed("http://www.viterbi.usc.edu/");

            /*
             * Start the crawl. This is a blocking operation, meaning that your code
             * will reach the line after this only when crawling is finished.
             */
            controller.start(MyCrawler.class, numberOfCrawlers);
        }
    }
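
    Beyond the storage folder, CrawlConfig has a few optional knobs worth setting for a small crawl. A short sketch (the concrete values below are illustrative assumptions, not requirements):

    // Optional tuning, placed right after config.setCrawlStorageFolder(...).
    config.setPolitenessDelay(1000);        // wait 1000 ms between requests to the same host
    config.setMaxDepthOfCrawling(2);        // follow links at most 2 hops from the seeds
    config.setMaxPagesToFetch(1000);        // stop after fetching 1000 pages
    config.setIncludeBinaryContentInCrawling(false); // skip binary files such as images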

    MyCrawler, which extends WebCrawler:

    public class MyCrawler extends WebCrawler {
        private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
                + "|png|mp3|mp4|zip|gz))$");

        /**
         * This method receives two parameters. The first parameter is the page
         * in which we have discovered this new URL and the second parameter is
         * the new URL. You should implement this function to specify whether
         * the given URL should be crawled or not (based on your crawling logic).
         * In this example, we are instructing the crawler to ignore URLs that
         * have css, js, gif, ... extensions and to only accept URLs that start
         * with "http://www.viterbi.usc.edu/". In this case, we didn't need the
         * referringPage parameter to make the decision.
         */
        @Override
        public boolean shouldVisit(Page referringPage, WebURL url) {
            String href = url.getURL().toLowerCase();
            return !FILTERS.matcher(href).matches()
                    && href.startsWith("http://www.viterbi.usc.edu/");
        }

        /**
         * This function is called when a page is fetched and ready
         * to be processed by your program.
         */
        @Override
        public void visit(Page page) {
            String url = page.getWebURL().getURL();
            System.out.println("URL: " + url);

            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
                String text = htmlParseData.getText();
                String html = htmlParseData.getHtml();
                Set<WebURL> links = htmlParseData.getOutgoingUrls();

                System.out.println("Text length: " + text.length());
                System.out.println("Html length: " + html.length());
                System.out.println("Number of outgoing links: " + links.size());
            }
        }
    }
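
    If you also want to keep the fetched pages on disk (e.g. for later processing), visit() is the natural place to do it. A minimal sketch, assuming a hypothetical output directory /tmp/pages; call savePage(url, html) from inside the instanceof branch above:

    // Hypothetical helper for MyCrawler: persist each fetched HTML page.
    private void savePage(String url, String html) {
        try {
            // Derive a filesystem-safe file name from the URL (deliberately simplistic).
            String name = url.replaceAll("[^a-zA-Z0-9]", "_") + ".html";
            java.nio.file.Path out = java.nio.file.Paths.get("/tmp/pages", name);
            java.nio.file.Files.createDirectories(out.getParent());
            java.nio.file.Files.write(out, html.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        } catch (java.io.IOException e) {
            // visit() cannot throw checked exceptions, so just report and continue.
            System.err.println("Could not save " + url + ": " + e.getMessage());
        }
    }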

    Do not forget the imports! For Controller:


    import edu.uci.ics.crawler4j.crawler.CrawlConfig;
    import edu.uci.ics.crawler4j.crawler.CrawlController;
    import edu.uci.ics.crawler4j.fetcher.PageFetcher;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
    And for MyCrawler:

    import java.util.Set;
    import java.util.regex.Pattern;

    import edu.uci.ics.crawler4j.crawler.Page;
    import edu.uci.ics.crawler4j.crawler.WebCrawler;
    import edu.uci.ics.crawler4j.parser.HtmlParseData;
    import edu.uci.ics.crawler4j.url.WebURL;

    Finally, change your main class to com.company.Controller (adjust the package to match your project).

  5. Some Issues

    1. SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
      SLF4J: Defaulting to no-operation (NOP) logger implementation
      SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.

      Go to https://www.slf4j.org/download.html and download the latest slf4j release (for example, slf4j-simple from http://repo2.maven.org/maven2/org/slf4j/slf4j-simple/1.7.25/).

      Add slf4j-simple-1.7.25.jar to your project.

    2. Exception in thread "main" java.lang.Exception: couldn't create the storage folder: /data/crawl does it already exist ?

      Use an absolute folder path, such as /Users/XXX/Desktop/XXX/HW2_WebCrawler/data/crawl
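
      Alternatively, build an absolute path at runtime so the code stays portable across machines. A small sketch (the crawler4j-data folder name is just an example):

      // Store crawl data under the current user's home directory.
      String crawlStorageFolder = java.nio.file.Paths.get(
              System.getProperty("user.home"), "crawler4j-data", "crawl").toString();
      config.setCrawlStorageFolder(crawlStorageFolder);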

    3. If you don't get any results

      Visit the website in your browser first to confirm it is reachable.
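
      Another possible cause (an assumption here, since the quickstart doesn't mention it) is that the site rejects crawler4j's default user agent. CrawlConfig lets you override it:

      // Present a custom user agent string; check the site's terms of service first.
      config.setUserAgentString("Mozilla/5.0 (compatible; MyCourseCrawler/1.0)");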