Sample code for 30+ languages & platforms
Visual FoxPro

A Simple Web Crawler

See more Spider Examples

This demonstrates a very simple web crawler using the Chilkat Spider component.

Chilkat Visual FoxPro Downloads

Visual FoxPro
* A simple multi-domain web crawler using the Chilkat Spider component.
*
* Starting from the seed URL(s), the script crawls up to 5 pages of one
* domain, collects that domain's outbound links, and queues links to base
* domains that have not been crawled yet (at most 1000 queued seeds).
* It runs until the seed queue is empty.
LOCAL lnSuccess
LOCAL loSpider
LOCAL loSeenDomains
LOCAL loSeedUrls
LOCAL lcUrl
LOCAL lcDomain
LOCAL lcBaseDomain
LOCAL i

lnSuccess = 0

loSpider = CreateObject('Chilkat.Spider')

loSeenDomains = CreateObject('Chilkat.StringArray')
loSeedUrls = CreateObject('Chilkat.StringArray')

* Unique = 1: appending a string that is already present is ignored.
loSeenDomains.Unique = 1
loSeedUrls.Unique = 1

* You will need to change the start URL to something else...
loSeedUrls.Append("http://something.whateverYouWant.com/")

* Use a cache so we don't have to re-fetch URLs previously fetched.
loSpider.CacheDir = "c:/spiderCache/"
loSpider.FetchFromCache = 1
loSpider.UpdateCache = 1

DO WHILE loSeedUrls.Count > 0

    lcUrl = loSeedUrls.Pop()

    * Several queued URLs may belong to the same base domain (they were
    * all queued before that domain was crawled).  Crawl each base
    * domain only once: skip this seed if its domain was already handled.
    lcDomain = loSpider.GetUrlDomain(lcUrl)
    lcBaseDomain = loSpider.GetBaseDomain(lcDomain)
    IF (loSeenDomains.Contains(lcBaseDomain) <> 0) THEN
        LOOP
    ENDIF
    loSeenDomains.Append(lcBaseDomain)

    loSpider.Initialize(lcUrl)

    * Set outbound URL exclude patterns.
    * URLs matching any of these patterns will not be added to the
    * collection of outbound links.
    * These are registered after every Initialize because, per the
    * Chilkat Spider reference, Initialize clears patterns that were
    * added earlier.
    loSpider.AddAvoidOutboundLinkPattern("*?id=*")
    loSpider.AddAvoidOutboundLinkPattern("*.mypages.*")
    loSpider.AddAvoidOutboundLinkPattern("*.personal.*")
    loSpider.AddAvoidOutboundLinkPattern("*.comcast.*")
    loSpider.AddAvoidOutboundLinkPattern("*.aol.*")
    loSpider.AddAvoidOutboundLinkPattern("*~*")

    * Spider up to 5 URLs of this domain.
    FOR i = 0 TO 4
        lnSuccess = loSpider.CrawlNext()
        IF (lnSuccess <> 1) THEN
            * Nothing more could be crawled for this domain.
            EXIT
        ENDIF

        * Display the URL we just crawled.
        ? loSpider.LastUrl

        * If the last URL was retrieved from cache,
        * we won't wait.  Otherwise we'll wait 1 second
        * before fetching the next URL.
        IF (loSpider.LastFromCache <> 1) THEN
            loSpider.SleepMs(1000)
        ENDIF
    NEXT

    * Add the outbound links to seedUrls, except
    * for the domains we've already seen.
    FOR i = 0 TO loSpider.NumOutboundLinks - 1

        lcUrl = loSpider.GetOutboundLink(i)
        lcDomain = loSpider.GetUrlDomain(lcUrl)
        lcBaseDomain = loSpider.GetBaseDomain(lcDomain)
        IF (loSeenDomains.Contains(lcBaseDomain) = 0) THEN
            * Don't let our list of seedUrls grow too large.
            IF (loSeedUrls.Count < 1000) THEN
                loSeedUrls.Append(lcUrl)
            ENDIF

        ENDIF

    NEXT

ENDDO

RELEASE loSpider
RELEASE loSeenDomains
RELEASE loSeedUrls