Chilkat HOME .NET Core C# Android™ AutoIt C C# C++ Chilkat2-Python CkPython Classic ASP DataFlex Delphi ActiveX Delphi DLL Go Java Lianja Mono C# Node.js Objective-C PHP ActiveX PHP Extension Perl PowerBuilder PowerShell PureBasic Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ VB.NET VBScript Visual Basic 6.0 Visual FoxPro Xojo Plugin
(Delphi DLL) A Simple Web Crawler: This demonstrates a very simple web crawler using the Chilkat Spider component.
uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants,
  System.Classes, Vcl.Graphics, Vcl.Controls, Vcl.Forms, Vcl.Dialogs,
  Vcl.StdCtrls, StringArray, Spider;

...

// Button handler: runs a very simple multi-domain web crawl.
//
// Starting from a single seed URL, it crawls up to PagesPerDomain pages of
// each seed's site, writes every crawled URL to Memo1, and then queues the
// site's outbound links (for base domains not yet visited) as new seeds.
// The loop ends when the seed list is empty.
procedure TForm1.Button1Click(Sender: TObject);
const
  PagesPerDomain = 5;       // max pages crawled per seed URL
  MaxSeedUrls = 1000;       // cap on the number of pending seed URLs
  PolitenessDelayMs = 1000; // pause after a fetch that was not served from cache
var
  spider: HCkSpider;
  seenDomains: HCkStringArray;
  seedUrls: HCkStringArray;
  url: PWideChar;
  domain: PWideChar;
  baseDomain: PWideChar;
  i: Integer;
begin
  spider := CkSpider_Create();
  seenDomains := CkStringArray_Create();
  seedUrls := CkStringArray_Create();
  try
    // NOTE(review): Unique mode presumably suppresses duplicate appends;
    // confirm against the Chilkat StringArray documentation.
    CkStringArray_putUnique(seenDomains, True);
    CkStringArray_putUnique(seedUrls, True);

    // You will need to change the start URL to something else...
    CkStringArray_Append(seedUrls, 'http://something.whateverYouWant.com/');

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*?id=*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.mypages.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.personal.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.comcast.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.aol.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*~*');

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    CkSpider_putCacheDir(spider, 'c:/spiderCache/');
    CkSpider_putFetchFromCache(spider, True);
    CkSpider_putUpdateCache(spider, True);

    while CkStringArray_getCount(seedUrls) > 0 do
    begin
      // Take the next seed URL and point the spider at it.
      url := CkStringArray__pop(seedUrls);
      CkSpider_Initialize(spider, url);

      // Spider up to PagesPerDomain URLs of this domain,
      // but first, save the base domain in seenDomains.
      domain := CkSpider__getUrlDomain(spider, url);
      CkStringArray_Append(seenDomains, CkSpider__getBaseDomain(spider, domain));

      for i := 0 to PagesPerDomain - 1 do
      begin
        // Stop crawling this site as soon as CrawlNext reports failure.
        // (Delphi forbids assigning to the loop variable, so Break is
        // the way to leave the loop early.)
        if not CkSpider_CrawlNext(spider) then
          Break;

        // Display the URL we just crawled.
        Memo1.Lines.Add(CkSpider__lastUrl(spider));

        // If the last URL was retrieved from cache, we won't wait.
        // Otherwise we'll wait before fetching the next URL.
        if not CkSpider_getLastFromCache(spider) then
          CkSpider_SleepMs(spider, PolitenessDelayMs);
      end;

      // Add the outbound links to seedUrls, except
      // for the domains we've already seen.
      for i := 0 to CkSpider_getNumOutboundLinks(spider) - 1 do
      begin
        url := CkSpider__getOutboundLink(spider, i);
        domain := CkSpider__getUrlDomain(spider, url);
        baseDomain := CkSpider__getBaseDomain(spider, domain);

        // Don't let our list of seedUrls grow too large.
        if (not CkStringArray_Contains(seenDomains, baseDomain)) and
           (CkStringArray_getCount(seedUrls) < MaxSeedUrls) then
          CkStringArray_Append(seedUrls, url);
      end;
    end;
  finally
    // Release the Chilkat handles even if an exception escaped the crawl.
    CkSpider_Dispose(spider);
    CkStringArray_Dispose(seenDomains);
    CkStringArray_Dispose(seedUrls);
  end;
end;
© 2000-2024 Chilkat Software, Inc. All Rights Reserved.