C
C
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component. Chilkat C Downloads
#include <stdio.h>

#include <C_CkSpider.h>
#include <C_CkStringArray.h>
/*
 * Crawl outward from a seed URL using the Chilkat Spider component.
 *
 * Seed URLs are popped one at a time until the seed list is empty. For each
 * seed, the spider is initialized on that URL, the seed's base domain is
 * recorded in seenDomains, and up to MAX_PAGES_PER_SITE pages are crawled,
 * printing each crawled URL to stdout. Afterwards, every outbound link whose
 * base domain is not yet in seenDomains is queued as a new seed, as long as
 * fewer than MAX_SEED_URLS seeds are pending.
 *
 * NOTE(review): url, domain and baseDomain are pointers returned by the
 * library; how long they stay valid across later library calls is not
 * visible in this file -- confirm against the Chilkat documentation.
 */
void ChilkatSample(void)
{
    enum {
        MAX_PAGES_PER_SITE = 5,    /* pages crawled per seed URL */
        FETCH_DELAY_MS     = 1000, /* pause after a non-cached fetch */
        MAX_SEED_URLS      = 1000  /* cap on pending seed URLs */
    };

    BOOL success;
    HCkSpider spider;
    HCkStringArray seenDomains;
    HCkStringArray seedUrls;
    const char *url;
    const char *domain;
    const char *baseDomain;
    int i;

    success = FALSE;
    spider = CkSpider_Create();
    seenDomains = CkStringArray_Create();
    seedUrls = CkStringArray_Create();

    CkStringArray_putUnique(seenDomains,TRUE);
    CkStringArray_putUnique(seedUrls,TRUE);

    // You will need to change the start URL to something else...
    CkStringArray_Append(seedUrls,"http://something.whateverYouWant.com/");

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    CkSpider_AddAvoidOutboundLinkPattern(spider,"*?id=*");
    CkSpider_AddAvoidOutboundLinkPattern(spider,"*.mypages.*");
    CkSpider_AddAvoidOutboundLinkPattern(spider,"*.personal.*");
    CkSpider_AddAvoidOutboundLinkPattern(spider,"*.comcast.*");
    CkSpider_AddAvoidOutboundLinkPattern(spider,"*.aol.*");
    CkSpider_AddAvoidOutboundLinkPattern(spider,"*~*");

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    CkSpider_putCacheDir(spider,"c:/spiderCache/");
    CkSpider_putFetchFromCache(spider,TRUE);
    CkSpider_putUpdateCache(spider,TRUE);

    while (CkStringArray_getCount(seedUrls) > 0) {
        url = CkStringArray_pop(seedUrls);
        CkSpider_Initialize(spider,url);

        // Before crawling, record this seed's base domain in seenDomains so
        // links back into it are not queued again below.
        domain = CkSpider_getUrlDomain(spider,url);
        CkStringArray_Append(seenDomains,CkSpider_getBaseDomain(spider,domain));

        // Spider a limited number of URLs of this domain.
        for (i = 0; i < MAX_PAGES_PER_SITE; i++) {
            success = CkSpider_CrawlNext(spider);
            if (success != TRUE) {
                // The crawl step did not succeed; stop crawling this seed.
                break;
            }

            // Display the URL we just crawled.
            printf("%s\n",CkSpider_lastUrl(spider));

            // If the last URL was retrieved from cache, we won't wait.
            // Otherwise pause before fetching the next URL.
            if (CkSpider_getLastFromCache(spider) != TRUE) {
                CkSpider_SleepMs(spider,FETCH_DELAY_MS);
            }
        }

        // Add the outbound links to seedUrls, except for the domains
        // we've already seen.
        for (i = 0; i < CkSpider_getNumOutboundLinks(spider); i++) {
            url = CkSpider_getOutboundLink(spider,i);
            domain = CkSpider_getUrlDomain(spider,url);
            baseDomain = CkSpider_getBaseDomain(spider,domain);

            // Don't let our list of seedUrls grow too large.
            if (CkStringArray_Contains(seenDomains,baseDomain) == FALSE
                && CkStringArray_getCount(seedUrls) < MAX_SEED_URLS) {
                CkStringArray_Append(seedUrls,url);
            }
        }
    }

    CkSpider_Dispose(spider);
    CkStringArray_Dispose(seenDomains);
    CkStringArray_Dispose(seedUrls);
}