Classic ASP
Classic ASP
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component.
Chilkat Classic ASP Downloads
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<%
' A very simple multi-domain web crawler built on the Chilkat Spider component.
'
' Starting from one seed URL, the script repeatedly:
'   1. pops a URL from seedUrls,
'   2. skips it if its base domain was already crawled,
'   3. otherwise crawls up to MAX_PAGES_PER_DOMAIN pages, writing each
'      crawled URL (HTML-encoded) to the response,
'   4. queues the outbound links that point at base domains not yet crawled.
' The loop ends when seedUrls is empty.

' Number of pages to crawl within each domain.
Const MAX_PAGES_PER_DOMAIN = 5
' Upper bound on the number of queued seed URLs.
Const MAX_SEED_URLS = 1000
' Pause between fetches that were not served from the cache, in milliseconds.
Const CRAWL_DELAY_MS = 1000

' Registers the outbound URL exclude patterns on the given spider.
' URLs matching any of these patterns will not be added to the
' collection of outbound links.
Sub AddOutboundAvoidPatterns(spiderObj)
    spiderObj.AddAvoidOutboundLinkPattern "*?id=*"
    spiderObj.AddAvoidOutboundLinkPattern "*.mypages.*"
    spiderObj.AddAvoidOutboundLinkPattern "*.personal.*"
    spiderObj.AddAvoidOutboundLinkPattern "*.comcast.*"
    spiderObj.AddAvoidOutboundLinkPattern "*.aol.*"
    spiderObj.AddAvoidOutboundLinkPattern "*~*"
End Sub

success = 0

set spider = Server.CreateObject("Chilkat.Spider")
set seenDomains = Server.CreateObject("Chilkat.StringArray")
set seedUrls = Server.CreateObject("Chilkat.StringArray")

' Both lists are configured to hold each string at most once.
seenDomains.Unique = 1
seedUrls.Unique = 1

' You will need to change the start URL to something else...
success = seedUrls.Append("http://something.whateverYouWant.com/")

' Use a cache so we don't have to re-fetch URLs previously fetched.
spider.CacheDir = "c:/spiderCache/"
spider.FetchFromCache = 1
spider.UpdateCache = 1

Do While seedUrls.Count > 0
    url = seedUrls.Pop()

    domain = spider.GetUrlDomain(url)
    baseDomain = spider.GetBaseDomain(domain)

    ' Several outbound links of the same (then unseen) domain may have been
    ' queued before that domain was crawled, so re-check here to make sure
    ' each base domain is spidered only once.
    If (seenDomains.Contains(baseDomain) = 0) Then
        success = seenDomains.Append(baseDomain)

        spider.Initialize url

        ' NOTE(review): the Chilkat Spider reference describes Initialize as
        ' clearing previously added avoid patterns - confirm for the installed
        ' version. Registering them after every Initialize keeps them active
        ' for each domain either way.
        AddOutboundAvoidPatterns spider

        ' Spider up to MAX_PAGES_PER_DOMAIN URLs of this domain.
        For i = 1 To MAX_PAGES_PER_DOMAIN
            success = spider.CrawlNext()
            ' Stop crawling this domain as soon as CrawlNext does not succeed.
            If (success <> 1) Then Exit For

            ' Display the URL we just crawled.
            Response.Write "<pre>" & Server.HTMLEncode( spider.LastUrl) & "</pre>"

            ' If the last URL was retrieved from cache, we won't wait.
            ' Otherwise we'll wait before fetching the next URL.
            If (spider.LastFromCache <> 1) Then
                spider.SleepMs CRAWL_DELAY_MS
            End If
        Next

        ' Add the outbound links to seedUrls, except
        ' for the domains we've already seen.
        For i = 0 To spider.NumOutboundLinks - 1
            linkUrl = spider.GetOutboundLink(i)
            linkDomain = spider.GetUrlDomain(linkUrl)
            linkBaseDomain = spider.GetBaseDomain(linkDomain)
            If (seenDomains.Contains(linkBaseDomain) = 0) Then
                ' Don't let our list of seedUrls grow too large.
                If (seedUrls.Count < MAX_SEED_URLS) Then
                    success = seedUrls.Append(linkUrl)
                End If
            End If
        Next
    End If
Loop
%>
</body>
</html>