Sample code for 30+ languages & platforms
Delphi DLL

A Simple Web Crawler

See more Spider Examples

This demonstrates a very simple web crawler using the Chilkat Spider component.

Chilkat Delphi DLL Downloads

Delphi DLL
uses
    Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
    Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls, StringArray, Spider;

...

// Runs a very simple web crawl with the Chilkat Spider component.
//
// Starting from the seed URL(s), each site is initialized in the spider, up to
// MAX_PAGES_PER_SITE of its pages are crawled (each crawled URL is appended to
// Memo1), and the outbound links that point to base domains not yet seen are
// queued as new seed URLs. The loop ends when the seed queue is empty.
//
// Sender is the standard VCL event argument and is not used.
procedure TForm1.Button1Click(Sender: TObject);
const
  MAX_PAGES_PER_SITE = 5;     // pages crawled per seed URL
  MAX_SEED_URLS      = 1000;  // upper bound on the pending seed queue
  FETCH_DELAY_MS     = 1000;  // pause after a fetch that was not served from cache
var
  spider: HCkSpider;
  seenDomains: HCkStringArray;
  seedUrls: HCkStringArray;
  // Values returned as PWideChar by the Chilkat functions are copied into
  // Delphi strings right away.
  // NOTE(review): the returned pointers presumably refer to memory owned by
  // the Chilkat object that may be reused by later calls - confirm against the
  // Chilkat Delphi DLL documentation. Copying is safe either way.
  url: string;
  domain: string;
  baseDomain: string;
  i: Integer;
begin
  spider := CkSpider_Create();
  seenDomains := CkStringArray_Create();
  seedUrls := CkStringArray_Create();
  try
    CkStringArray_putUnique(seenDomains, True);
    CkStringArray_putUnique(seedUrls, True);

    // You will need to change the start URL to something else...
    CkStringArray_Append(seedUrls, 'http://something.whateverYouWant.com/');

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*?id=*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.mypages.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.personal.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.comcast.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.aol.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*~*');

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    CkSpider_putCacheDir(spider, 'c:/spiderCache/');
    CkSpider_putFetchFromCache(spider, True);
    CkSpider_putUpdateCache(spider, True);

    while CkStringArray_getCount(seedUrls) > 0 do
    begin
      url := CkStringArray__pop(seedUrls);
      CkSpider_Initialize(spider, PWideChar(url));

      // Before spidering this site, record its base domain in seenDomains.
      domain := CkSpider__getUrlDomain(spider, PWideChar(url));
      baseDomain := CkSpider__getBaseDomain(spider, PWideChar(domain));
      CkStringArray_Append(seenDomains, PWideChar(baseDomain));

      // Spider up to MAX_PAGES_PER_SITE URLs of this domain.
      for i := 1 to MAX_PAGES_PER_SITE do
      begin
        // Stop on the first failed crawl. A for-loop control variable cannot
        // be assigned to in Delphi, so Break is used to leave the loop.
        if not CkSpider_CrawlNext(spider) then
          Break;

        // Display the URL we just crawled.
        Memo1.Lines.Add(CkSpider__lastUrl(spider));

        // If the last URL was retrieved from cache, we won't wait.
        // Otherwise wait before fetching the next URL.
        if not CkSpider_getLastFromCache(spider) then
          CkSpider_SleepMs(spider, FETCH_DELAY_MS);
      end;

      // Add the outbound links to seedUrls, except for the domains
      // we've already seen.
      for i := 0 to CkSpider_getNumOutboundLinks(spider) - 1 do
      begin
        url := CkSpider__getOutboundLink(spider, i);
        domain := CkSpider__getUrlDomain(spider, PWideChar(url));
        baseDomain := CkSpider__getBaseDomain(spider, PWideChar(domain));

        // Don't let our list of seedUrls grow too large.
        if (not CkStringArray_Contains(seenDomains, PWideChar(baseDomain))) and
           (CkStringArray_getCount(seedUrls) < MAX_SEED_URLS) then
          CkStringArray_Append(seedUrls, PWideChar(url));
      end;
    end;
  finally
    // Release the Chilkat objects even if an exception was raised above.
    CkSpider_Dispose(spider);
    CkStringArray_Dispose(seenDomains);
    CkStringArray_Dispose(seedUrls);
  end;
end;