Visual FoxPro
Visual FoxPro
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component.
Chilkat Visual FoxPro Downloads
* Simple multi-domain web crawler built on the Chilkat Spider ActiveX component.
*
* Starting from the seed URL(s), the script crawls up to 5 pages per domain,
* prints each crawled URL, and queues outbound links that point to base
* domains not yet visited. The queue of seed URLs is capped at 1000 entries.
LOCAL lnSuccess
LOCAL loSpider
LOCAL loSeenDomains
LOCAL loSeedUrls
LOCAL lcUrl
LOCAL lcDomain
LOCAL lcBaseDomain
LOCAL i

lnSuccess = 0

loSpider = CreateObject('Chilkat.Spider')
loSeenDomains = CreateObject('Chilkat.StringArray')
loSeedUrls = CreateObject('Chilkat.StringArray')

* Both collections reject duplicate strings.
loSeenDomains.Unique = 1
loSeedUrls.Unique = 1

* You will need to change the start URL to something else...
loSeedUrls.Append("http://something.whateverYouWant.com/")

* Use a cache so we don't have to re-fetch URLs previously fetched.
loSpider.CacheDir = "c:/spiderCache/"
loSpider.FetchFromCache = 1
loSpider.UpdateCache = 1

DO WHILE loSeedUrls.Count > 0
    lcUrl = loSeedUrls.Pop()

    * Several queued URLs may belong to the same base domain. Skip this one
    * if that domain was crawled after the URL had been queued; otherwise
    * record the domain as seen before crawling it.
    lcDomain = loSpider.GetUrlDomain(lcUrl)
    lcBaseDomain = loSpider.GetBaseDomain(lcDomain)
    IF loSeenDomains.Contains(lcBaseDomain) <> 0
        LOOP
    ENDIF
    loSeenDomains.Append(lcBaseDomain)

    loSpider.Initialize(lcUrl)

    * Set outbound URL exclude patterns. URLs matching any of these patterns
    * will not be added to the collection of outbound links.
    * The Chilkat Spider reference states that Initialize clears previously
    * added patterns, so they are registered again after every Initialize.
    loSpider.AddAvoidOutboundLinkPattern("*?id=*")
    loSpider.AddAvoidOutboundLinkPattern("*.mypages.*")
    loSpider.AddAvoidOutboundLinkPattern("*.personal.*")
    loSpider.AddAvoidOutboundLinkPattern("*.comcast.*")
    loSpider.AddAvoidOutboundLinkPattern("*.aol.*")
    loSpider.AddAvoidOutboundLinkPattern("*~*")

    * Spider at most 5 URLs of this domain.
    FOR i = 0 TO 4
        lnSuccess = loSpider.CrawlNext()
        IF lnSuccess <> 1
            * Nothing more could be crawled for this domain.
            EXIT
        ENDIF

        * Display the URL we just crawled.
        ? loSpider.LastUrl

        * If the last URL was retrieved from cache, we won't wait.
        * Otherwise we'll wait 1 second before fetching the next URL.
        IF loSpider.LastFromCache <> 1
            loSpider.SleepMs(1000)
        ENDIF
    ENDFOR

    * Add the outbound links to seedUrls, except for the domains
    * we've already seen.
    FOR i = 0 TO loSpider.NumOutboundLinks - 1
        lcUrl = loSpider.GetOutboundLink(i)
        lcDomain = loSpider.GetUrlDomain(lcUrl)
        lcBaseDomain = loSpider.GetBaseDomain(lcDomain)
        IF loSeenDomains.Contains(lcBaseDomain) = 0
            * Don't let our list of seedUrls grow too large.
            IF loSeedUrls.Count < 1000
                loSeedUrls.Append(lcUrl)
            ENDIF
        ENDIF
    ENDFOR
ENDDO

RELEASE loSpider
RELEASE loSeenDomains
RELEASE loSeedUrls