Delphi ActiveX
A Simple Web Crawler

This demonstrates a very simple web crawler using the Chilkat Spider component.
Chilkat Delphi ActiveX Downloads

Delphi ActiveX
uses
    Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
    Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls, Chilkat_TLB;

...

// Runs a small breadth-limited crawl starting from a single seed URL.
//
// For every URL popped from the seed list, up to MaxPagesPerDomain pages are
// crawled with the Chilkat Spider and each crawled URL is appended to Memo1.
// Outbound links whose base domain has not been visited yet are then queued
// as new seeds (the queue is capped at MaxSeedUrls entries). The handler
// returns once the seed list is empty.
//
// Sender is the control that raised the click; it is not used.
procedure TForm1.Button1Click(Sender: TObject);
const
  // Maximum number of pages crawled per seed URL.
  MaxPagesPerDomain = 5;
  // Upper bound on the number of queued seed URLs.
  MaxSeedUrls = 1000;
  // Pause between two fetches that were not served from the cache.
  FetchDelayMs = 1000;
var
  success: Integer;
  spider: TChilkatSpider;
  seenDomains: TCkStringArray;
  seedUrls: TCkStringArray;
  url: WideString;
  domain: WideString;
  baseDomain: WideString;
  i: Integer;
begin
  spider := TChilkatSpider.Create(Self);
  seenDomains := TCkStringArray.Create(Self);
  seedUrls := TCkStringArray.Create(Self);
  try
    // NOTE(review): Unique = 1 presumably makes the arrays reject duplicate
    // entries -- confirm against the Chilkat CkStringArray reference.
    seenDomains.Unique := 1;
    seedUrls.Unique := 1;

    // You will need to change the start URL to something else...
    seedUrls.Append('http://something.whateverYouWant.com/');

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    spider.AddAvoidOutboundLinkPattern('*?id=*');
    spider.AddAvoidOutboundLinkPattern('*.mypages.*');
    spider.AddAvoidOutboundLinkPattern('*.personal.*');
    spider.AddAvoidOutboundLinkPattern('*.comcast.*');
    spider.AddAvoidOutboundLinkPattern('*.aol.*');
    spider.AddAvoidOutboundLinkPattern('*~*');

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    spider.CacheDir := 'c:/spiderCache/';
    spider.FetchFromCache := 1;
    spider.UpdateCache := 1;

    while seedUrls.Count > 0 do
    begin
      url := seedUrls.Pop();
      spider.Initialize(url);

      // Record the base domain of this seed before crawling it, so that
      // outbound links pointing back to it are not queued again below.
      domain := spider.GetUrlDomain(url);
      seenDomains.Append(spider.GetBaseDomain(domain));

      // Crawl up to MaxPagesPerDomain URLs of this domain.
      for i := 1 to MaxPagesPerDomain do
      begin
        success := spider.CrawlNext();

        // Stop crawling this seed as soon as CrawlNext does not report
        // success. (Delphi forbids assigning to the loop variable, so the
        // loop is left with Break.)
        if success <> 1 then
          Break;

        // Display the URL we just crawled.
        Memo1.Lines.Add(spider.LastUrl);

        // If the last URL was retrieved from cache, we won't wait.
        // Otherwise we'll wait before fetching the next URL.
        if spider.LastFromCache <> 1 then
          spider.SleepMs(FetchDelayMs);
      end;

      // Add the outbound links to seedUrls, except for the domains
      // we've already seen.
      for i := 0 to spider.NumOutboundLinks - 1 do
      begin
        url := spider.GetOutboundLink(i);
        domain := spider.GetUrlDomain(url);
        baseDomain := spider.GetBaseDomain(domain);

        // Don't let our list of seedUrls grow too large.
        if (seenDomains.Contains(baseDomain) = 0) and
           (seedUrls.Count < MaxSeedUrls) then
          seedUrls.Append(url);
      end;
    end;
  finally
    // The components are owned by the form, but they are only needed for
    // this click. Free them here so repeated clicks do not accumulate
    // instances on the form until it is destroyed.
    seedUrls.Free;
    seenDomains.Free;
    spider.Free;
  end;
end;