Chilkat HOME .NET Core C# Android™ AutoIt C C# C++ Chilkat2-Python CkPython Classic ASP DataFlex Delphi ActiveX Delphi DLL Go Java Lianja Mono C# Node.js Objective-C PHP ActiveX PHP Extension Perl PowerBuilder PowerShell PureBasic Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ VB.NET VBScript Visual Basic 6.0 Visual FoxPro Xojo Plugin
(Delphi ActiveX) A Simple Web Crawler. This demonstrates a very simple web crawler using the Chilkat Spider component.
uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants,
  System.Classes, Vcl.Graphics, Vcl.Controls, Vcl.Forms, Vcl.Dialogs,
  Vcl.StdCtrls, Chilkat_TLB;

...

// Button handler: runs a very simple web crawl with the Chilkat Spider.
//
// Starting from a single seed URL, the handler repeatedly pops a URL off
// the seed list, crawls up to MaxUrlsPerDomain pages starting from it, and
// appends each crawled URL to Memo1. The outbound links collected by the
// spider are then queued as new seeds, skipping any whose base domain has
// already been visited. The loop ends when the seed list is empty.
//
// Sender is not used.
procedure TForm1.Button1Click(Sender: TObject);
const
  MaxUrlsPerDomain = 5;    // pages crawled per seed URL
  MaxSeedUrls      = 1000; // upper bound on the pending seed list
  FetchDelayMs     = 1000; // pause after a non-cached fetch
var
  spider: TChilkatSpider;
  seenDomains: TCkStringArray;
  seedUrls: TCkStringArray;
  url: WideString;
  domain: WideString;
  baseDomain: WideString;
  i: Integer;
  success: Integer;
begin
  spider := TChilkatSpider.Create(Self);
  seenDomains := TCkStringArray.Create(Self);
  seedUrls := TCkStringArray.Create(Self);
  try
    seenDomains.Unique := 1;
    seedUrls.Unique := 1;

    // You will need to change the start URL to something else...
    seedUrls.Append('http://something.whateverYouWant.com/');

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    spider.AddAvoidOutboundLinkPattern('*?id=*');
    spider.AddAvoidOutboundLinkPattern('*.mypages.*');
    spider.AddAvoidOutboundLinkPattern('*.personal.*');
    spider.AddAvoidOutboundLinkPattern('*.comcast.*');
    spider.AddAvoidOutboundLinkPattern('*.aol.*');
    spider.AddAvoidOutboundLinkPattern('*~*');

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    spider.CacheDir := 'c:/spiderCache/';
    spider.FetchFromCache := 1;
    spider.UpdateCache := 1;

    while seedUrls.Count > 0 do
    begin
      url := seedUrls.Pop();
      spider.Initialize(url);

      // Record the base domain in seenDomains before crawling so that
      // outbound links back to it are not queued again below.
      domain := spider.GetUrlDomain(url);
      seenDomains.Append(spider.GetBaseDomain(domain));

      // Spider up to MaxUrlsPerDomain URLs of this domain.
      for i := 1 to MaxUrlsPerDomain do
      begin
        success := spider.CrawlNext();
        if (success <> 1) then
          // Stop crawling this seed. Delphi forbids assigning to the
          // for-loop control variable, so Break is used to exit.
          Break;

        // Display the URL we just crawled.
        Memo1.Lines.Add(spider.LastUrl);

        // If the last URL was retrieved from cache, we won't wait.
        // Otherwise we'll wait before fetching the next URL.
        if (spider.LastFromCache <> 1) then
          spider.SleepMs(FetchDelayMs);
      end;

      // Add the outbound links to seedUrls, except for the domains
      // we've already seen.
      for i := 0 to spider.NumOutboundLinks - 1 do
      begin
        url := spider.GetOutboundLink(i);
        domain := spider.GetUrlDomain(url);
        baseDomain := spider.GetBaseDomain(domain);
        if (seenDomains.Contains(baseDomain) = 0) then
        begin
          // Don't let our list of seedUrls grow too large.
          if (seedUrls.Count < MaxSeedUrls) then
            seedUrls.Append(url);
        end;
      end;
    end;
  finally
    // Free the per-click components here; otherwise each click would leave
    // three more objects owned by the form until the form itself is freed.
    seedUrls.Free;
    seenDomains.Free;
    spider.Free;
  end;
end;
© 2000-2024 Chilkat Software, Inc. All Rights Reserved.