Sample code for 30+ languages & platforms
Classic ASP

A Simple Web Crawler

See more Spider Examples

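This example demonstrates a very simple web crawler built with the Chilkat Spider component: it crawls up to five pages per domain, then follows outbound links to domains it has not yet visited.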

Chilkat Classic ASP Downloads

Classic ASP
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<%
' A simple web crawler using the Chilkat Spider component.
'
' Starting from a seed URL, the script crawls a few pages of each domain,
' writes every crawled URL to the response, and then queues the outbound
' links that point to base domains it has not visited yet.

' Maximum number of pages crawled per domain.
Const MAX_PAGES_PER_DOMAIN = 5
' Upper bound on the number of queued seed URLs.
Const MAX_SEED_URLS = 1000

' Registers the outbound URL exclude patterns on the given spider.
' URLs matching any of these patterns will not be added to the
' collection of outbound links.
Sub AddOutboundAvoidPatterns(crawler)
    crawler.AddAvoidOutboundLinkPattern "*?id=*"
    crawler.AddAvoidOutboundLinkPattern "*.mypages.*"
    crawler.AddAvoidOutboundLinkPattern "*.personal.*"
    crawler.AddAvoidOutboundLinkPattern "*.comcast.*"
    crawler.AddAvoidOutboundLinkPattern "*.aol.*"
    crawler.AddAvoidOutboundLinkPattern "*~*"
End Sub

success = 0

set spider = Server.CreateObject("Chilkat.Spider")

' seenDomains holds the base domains already visited;
' seedUrls is the queue of URLs still to be crawled.
set seenDomains = Server.CreateObject("Chilkat.StringArray")
set seedUrls = Server.CreateObject("Chilkat.StringArray")

' NOTE(review): Unique = 1 presumably makes Append ignore duplicates -
' confirm against the Chilkat StringArray reference.
seenDomains.Unique = 1
seedUrls.Unique = 1

' You will need to change the start URL to something else...
success = seedUrls.Append("http://something.whateverYouWant.com/")

' Use a cache so we don't have to re-fetch URLs previously fetched.
spider.CacheDir = "c:/spiderCache/"
spider.FetchFromCache = 1
spider.UpdateCache = 1

Do While seedUrls.Count > 0

    url = seedUrls.Pop()
    spider.Initialize url

    ' The Chilkat Spider reference states that Initialize clears any
    ' patterns added via AddAvoidOutboundLinkPattern, so the patterns are
    ' registered after every Initialize rather than once up front.
    AddOutboundAvoidPatterns spider

    ' Save the base domain in seenDomains before crawling it.
    domain = spider.GetUrlDomain(url)
    success = seenDomains.Append(spider.GetBaseDomain(domain))

    ' Crawl up to MAX_PAGES_PER_DOMAIN URLs of this domain.
    For i = 1 To MAX_PAGES_PER_DOMAIN
        success = spider.CrawlNext()
        If (success <> 1) Then
            ' Nothing was crawled; stop working on this domain.
            Exit For
        End If

        ' Display the URL we just crawled.
        Response.Write "<pre>" & Server.HTMLEncode( spider.LastUrl) & "</pre>"

        ' If the last URL was retrieved from cache,
        ' we won't wait.  Otherwise we'll wait 1 second
        ' before fetching the next URL.
        If (spider.LastFromCache <> 1) Then
            spider.SleepMs 1000
        End If
    Next

    ' Add the outbound links to seedUrls, except
    ' for the domains we've already seen.
    For i = 0 To spider.NumOutboundLinks - 1

        url = spider.GetOutboundLink(i)
        domain = spider.GetUrlDomain(url)
        baseDomain = spider.GetBaseDomain(domain)
        If (seenDomains.Contains(baseDomain) = 0) Then
            ' Don't let our list of seedUrls grow too large.
            If (seedUrls.Count < MAX_SEED_URLS) Then
                success = seedUrls.Append(url)
            End If

        End If

    Next

Loop

%>
</body>
</html>