(Ruby) A Simple Web Crawler

This demonstrates a very simple web crawler using the Chilkat Spider component.

Chilkat Ruby Downloads

Install from rubygems.org:
    gem install chilkat
Or download the Chilkat Ruby Library for Windows, macOS, Linux, or Alpine Linux.

require 'chilkat'

# Crawler state: the spider plus two string collections used by the crawl loop.
spider = Chilkat::CkSpider.new

# seenDomains records base domains that have already been crawled;
# seedUrls is the queue of start URLs still waiting to be crawled.
seenDomains = Chilkat::CkStringArray.new
seedUrls = Chilkat::CkStringArray.new

# Turn on the Unique flag for both collections
# (presumably so duplicate entries are not stored -- confirm against the Chilkat docs).
[seenDomains, seedUrls].each { |list| list.put_Unique(true) }

# Starting point of the crawl. Replace this placeholder with a real URL.
seedUrls.Append("http://something.whateverYouWant.com/")

# Outbound-link exclusions: a URL matching any of these wildcard patterns
# is not added to the spider's collection of outbound links.
[
    "*?id=*",
    "*.mypages.*",
    "*.personal.*",
    "*.comcast.*",
    "*.aol.*",
    "*~*"
].each { |pattern| spider.AddAvoidOutboundLinkPattern(pattern) }

# Keep a cache on disk so previously fetched URLs are not downloaded again:
# read from it when possible and write newly fetched pages into it.
spider.put_CacheDir("c:/spiderCache/")
spider.put_FetchFromCache(true)
spider.put_UpdateCache(true)

# Main crawl loop: take one seed URL at a time, spider up to five pages of
# its domain, then queue the outbound links that point at domains not yet seen.
# Runs until the seed list is empty.
#
# NOTE(review): Chilkat's reference documentation for Initialize reportedly
# says it clears patterns added via AddAvoidOutboundLinkPattern. If so, the
# avoid patterns registered before this loop are dropped on the first
# Initialize call and would need to be re-added after it -- confirm.
while seedUrls.get_Count() > 0

    url = seedUrls.pop()
    spider.Initialize(url)

    # Record this URL's base domain before crawling, so that outbound links
    # pointing back at it are not queued as new seeds.
    domain = spider.getUrlDomain(url)
    seenDomains.Append(spider.getBaseDomain(domain))

    # Spider at most 5 URLs of this domain.
    5.times do
        # Stop at the first failed crawl. An explicit break is required:
        # assigning to the index variable of a Ruby `for` loop does not end
        # the iteration, so the remaining passes would still call CrawlNext.
        break unless spider.CrawlNext() == true

        # Display the URL we just crawled.
        print spider.lastUrl() + "\n"

        # Pause 1 second before the next fetch, unless the page just
        # retrieved came from the cache.
        spider.SleepMs(1000) if spider.get_LastFromCache() != true
    end

    # Add the outbound links to seedUrls, except for links whose base domain
    # has already been seen.
    spider.get_NumOutboundLinks().times do |i|
        # Don't let the list of seed URLs grow too large. The count never
        # shrinks inside this pass, so once the cap is hit nothing more can
        # be added and the remaining links can be skipped.
        break if seedUrls.get_Count() >= 1000

        link = spider.getOutboundLink(i)
        linkBaseDomain = spider.getBaseDomain(spider.getUrlDomain(link))
        seedUrls.Append(link) if seenDomains.Contains(linkBaseDomain) == false
    end

end