Sample code for 30+ languages & platforms
C

A Simple Web Crawler

See more Spider Examples

This demonstrates a very simple web crawler using the Chilkat Spider component.

Chilkat C Downloads

C
#include <stdio.h>

#include <C_CkSpider.h>
#include <C_CkStringArray.h>

/*
 * Demonstrates a simple multi-domain web crawl with the Chilkat Spider.
 *
 * Starting from a single seed URL, the function repeatedly:
 *   1. takes a URL from seedUrls (CkStringArray_pop) and initializes the
 *      spider with it,
 *   2. records that URL's base domain in seenDomains,
 *   3. calls CkSpider_CrawlNext up to PAGES_PER_DOMAIN times, printing each
 *      crawled URL to stdout,
 *   4. appends every outbound link whose base domain is not in seenDomains
 *      to seedUrls, but only while seedUrls holds fewer than MAX_SEED_URLS
 *      entries.
 * The crawl ends when seedUrls is empty. All three Chilkat objects created
 * here are disposed before returning.
 *
 * Takes no arguments and returns nothing.
 */
void ChilkatSample(void)
{
    enum {
        PAGES_PER_DOMAIN = 5,   /* max CrawlNext calls per seed URL */
        MAX_SEED_URLS = 1000,   /* stop queueing seeds at this size */
        CRAWL_DELAY_MS = 1000   /* pause after a non-cached fetch */
    };

    HCkSpider spider;
    HCkStringArray seenDomains;
    HCkStringArray seedUrls;
    const char *url;
    const char *domain;
    const char *baseDomain;
    BOOL success;
    int i;

    spider = CkSpider_Create();
    seenDomains = CkStringArray_Create();
    seedUrls = CkStringArray_Create();

    /* Presumably makes each array reject duplicate entries -- confirm
     * against the Chilkat StringArray documentation. */
    CkStringArray_putUnique(seenDomains, TRUE);
    CkStringArray_putUnique(seedUrls, TRUE);

    /* You will need to change the start URL to something else... */
    CkStringArray_Append(seedUrls, "http://something.whateverYouWant.com/");

    /* Outbound URL exclude patterns: URLs matching any of these will not be
     * added to the spider's collection of outbound links. */
    CkSpider_AddAvoidOutboundLinkPattern(spider, "*?id=*");
    CkSpider_AddAvoidOutboundLinkPattern(spider, "*.mypages.*");
    CkSpider_AddAvoidOutboundLinkPattern(spider, "*.personal.*");
    CkSpider_AddAvoidOutboundLinkPattern(spider, "*.comcast.*");
    CkSpider_AddAvoidOutboundLinkPattern(spider, "*.aol.*");
    CkSpider_AddAvoidOutboundLinkPattern(spider, "*~*");

    /* Use a cache so we don't have to re-fetch URLs previously fetched. */
    CkSpider_putCacheDir(spider, "c:/spiderCache/");
    CkSpider_putFetchFromCache(spider, TRUE);
    CkSpider_putUpdateCache(spider, TRUE);

    /* NOTE(review): url, domain and baseDomain point at strings returned by
     * the Chilkat objects. How long those pointers stay valid across later
     * calls on the same object is not visible here -- confirm against the
     * Chilkat C documentation before holding them any longer than this. */
    while (CkStringArray_getCount(seedUrls) > 0) {
        url = CkStringArray_pop(seedUrls);
        CkSpider_Initialize(spider, url);

        /* Remember this seed's base domain before crawling so that links
         * pointing back to it are not queued as new seeds below. */
        domain = CkSpider_getUrlDomain(spider, url);
        CkStringArray_Append(seenDomains, CkSpider_getBaseDomain(spider, domain));

        /* Spider a handful of URLs of this domain. */
        for (i = 0; i < PAGES_PER_DOMAIN; i++) {
            success = CkSpider_CrawlNext(spider);
            if (success != TRUE) {
                /* CrawlNext did not succeed: stop crawling this seed. */
                break;
            }

            /* Display the URL we just crawled. */
            printf("%s\n", CkSpider_lastUrl(spider));

            /* Only pause when the page was actually fetched rather than
             * served from the cache. */
            if (CkSpider_getLastFromCache(spider) != TRUE) {
                CkSpider_SleepMs(spider, CRAWL_DELAY_MS);
            }
        }

        /* Queue the outbound links as future seeds, except for the domains
         * we've already seen. */
        for (i = 0; i < CkSpider_getNumOutboundLinks(spider); i++) {
            url = CkSpider_getOutboundLink(spider, i);
            domain = CkSpider_getUrlDomain(spider, url);
            baseDomain = CkSpider_getBaseDomain(spider, domain);

            if (CkStringArray_Contains(seenDomains, baseDomain) != FALSE) {
                continue;
            }

            /* Don't let our list of seedUrls grow too large. */
            if (CkStringArray_getCount(seedUrls) < MAX_SEED_URLS) {
                CkStringArray_Append(seedUrls, url);
            }
        }
    }

    CkSpider_Dispose(spider);
    CkStringArray_Dispose(seenDomains);
    CkStringArray_Dispose(seedUrls);
}