Chilkat HOME .NET Core C# Android™ AutoIt C C# C++ Chilkat2-Python CkPython Classic ASP DataFlex Delphi ActiveX Delphi DLL Go Java Lianja Mono C# Node.js Objective-C PHP ActiveX PHP Extension Perl PowerBuilder PowerShell PureBasic Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ VB.NET VBScript Visual Basic 6.0 Visual FoxPro Xojo Plugin
(Unicode C) A Simple Web Crawler
This demonstrates a very simple web crawler using the Chilkat Spider component.
#include <C_CkSpiderW.h>
#include <C_CkStringArrayW.h>
#include <wchar.h>   // wprintf

/*
 * ChilkatSample - a very simple web crawler built on the Chilkat Spider.
 *
 * Starting from a single seed URL, repeatedly:
 *   1. pops a URL from seedUrls and initializes the spider with it,
 *   2. records that URL's base domain in seenDomains,
 *   3. crawls up to PAGES_PER_SITE URLs, printing each crawled URL,
 *   4. appends the spider's outbound links to seedUrls, skipping links
 *      whose base domain is already in seenDomains and never letting
 *      seedUrls grow beyond MAX_SEED_URLS entries.
 * The loop ends when seedUrls is empty. Takes no arguments and returns
 * nothing; output goes to stdout via wprintf.
 */
void ChilkatSample(void)
{
    enum {
        PAGES_PER_SITE = 5,     // URLs crawled per seed before moving on
        MAX_SEED_URLS  = 1000,  // cap on the pending seed list
        FETCH_DELAY_MS = 1000   // pause after a fetch that was not served from cache
    };

    HCkSpiderW spider;
    HCkStringArrayW seenDomains;
    HCkStringArrayW seedUrls;
    int i;

    spider = CkSpiderW_Create();
    seenDomains = CkStringArrayW_Create();
    seedUrls = CkStringArrayW_Create();

    // Enable the Unique property on both arrays
    // (presumably so duplicate entries are not stored -- confirm in the Chilkat docs).
    CkStringArrayW_putUnique(seenDomains,TRUE);
    CkStringArrayW_putUnique(seedUrls,TRUE);

    // You will need to change the start URL to something else...
    CkStringArrayW_Append(seedUrls,L"http://something.whateverYouWant.com/");

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    CkSpiderW_AddAvoidOutboundLinkPattern(spider,L"*?id=*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider,L"*.mypages.*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider,L"*.personal.*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider,L"*.comcast.*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider,L"*.aol.*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider,L"*~*");

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    CkSpiderW_putCacheDir(spider,L"c:/spiderCache/");
    CkSpiderW_putFetchFromCache(spider,TRUE);
    CkSpiderW_putUpdateCache(spider,TRUE);

    while (CkStringArrayW_getCount(seedUrls) > 0) {
        const wchar_t *url;
        const wchar_t *domain;
        BOOL success;

        url = CkStringArrayW_pop(seedUrls);
        CkSpiderW_Initialize(spider,url);

        // Before crawling, save this URL's base domain in seenDomains.
        domain = CkSpiderW_getUrlDomain(spider,url);
        CkStringArrayW_Append(seenDomains,CkSpiderW_getBaseDomain(spider,domain));

        // Spider up to PAGES_PER_SITE URLs of this domain.
        for (i = 0; i < PAGES_PER_SITE; i++) {
            success = CkSpiderW_CrawlNext(spider);
            if (success != TRUE) {
                // Nothing more could be crawled for this seed.
                break;
            }

            // Display the URL we just crawled.
            // %ls is the wide-string conversion; in ISO C, %s inside
            // wprintf denotes a narrow char* string.
            wprintf(L"%ls\n",CkSpiderW_lastUrl(spider));

            // If the last URL was retrieved from cache, we won't wait.
            // Otherwise we'll wait before fetching the next URL.
            if (CkSpiderW_getLastFromCache(spider) != TRUE) {
                CkSpiderW_SleepMs(spider,FETCH_DELAY_MS);
            }
        }

        // Add the outbound links to seedUrls, except
        // for the domains we've already seen.
        for (i = 0; i < CkSpiderW_getNumOutboundLinks(spider); i++) {
            const wchar_t *baseDomain;

            url = CkSpiderW_getOutboundLink(spider,i);
            domain = CkSpiderW_getUrlDomain(spider,url);
            baseDomain = CkSpiderW_getBaseDomain(spider,domain);

            if (CkStringArrayW_Contains(seenDomains,baseDomain) == FALSE) {
                // Don't let our list of seedUrls grow too large.
                if (CkStringArrayW_getCount(seedUrls) < MAX_SEED_URLS) {
                    CkStringArrayW_Append(seedUrls,url);
                }
            }
        }
    }

    // Release the Chilkat objects created at the top of the function.
    CkSpiderW_Dispose(spider);
    CkStringArrayW_Dispose(seenDomains);
    CkStringArrayW_Dispose(seedUrls);
}
© 2000-2024 Chilkat Software, Inc. All Rights Reserved.