Delphi DLL
Delphi DLL
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component.
Chilkat Delphi DLL Downloads
uses
Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls, StringArray, Spider;
...
{ Button handler that runs a very simple web crawl with the Chilkat Spider.

  Starting from a single seed URL, it repeatedly pops a URL from seedUrls,
  crawls up to MaxPagesPerDomain pages beginning at that URL, writes each
  crawled URL to Memo1, and then queues the spider's outbound links whose
  base domain has not been recorded in seenDomains yet.

  The loop ends when seedUrls is empty. The crawl runs synchronously inside
  this handler.

  Sender is not used. }
procedure TForm1.Button1Click(Sender: TObject);
const
  MaxPagesPerDomain = 5;   // URLs to crawl per seed before moving on
  MaxSeedUrls = 1000;      // cap on the number of queued seed URLs
  CrawlDelayMs = 1000;     // pause after a fetch that was not served from cache
var
  spider: HCkSpider;
  seenDomains: HCkStringArray;
  seedUrls: HCkStringArray;
  // Results of the Chilkat "__" functions are copied into Delphi strings
  // instead of being kept as raw PWideChar.
  // NOTE(review): the returned PWideChar presumably points at memory owned by
  // the Chilkat object; copying avoids depending on how long that memory stays
  // valid across later calls on the same object - confirm against Chilkat docs.
  url: string;
  domain: string;
  baseDomain: string;
  i: Integer;
begin
  spider := CkSpider_Create();
  seenDomains := CkStringArray_Create();
  seedUrls := CkStringArray_Create();
  try
    CkStringArray_putUnique(seenDomains, True);
    CkStringArray_putUnique(seedUrls, True);

    // You will need to change the start URL to something else...
    CkStringArray_Append(seedUrls, 'http://something.whateverYouWant.com/');

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*?id=*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.mypages.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.personal.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.comcast.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*.aol.*');
    CkSpider_AddAvoidOutboundLinkPattern(spider, '*~*');

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    CkSpider_putCacheDir(spider, 'c:/spiderCache/');
    CkSpider_putFetchFromCache(spider, True);
    CkSpider_putUpdateCache(spider, True);

    while CkStringArray_getCount(seedUrls) > 0 do
    begin
      url := CkStringArray__pop(seedUrls);
      CkSpider_Initialize(spider, PWideChar(url));

      // Before crawling, save the base domain of this seed in seenDomains.
      domain := CkSpider__getUrlDomain(spider, PWideChar(url));
      baseDomain := CkSpider__getBaseDomain(spider, PWideChar(domain));
      CkStringArray_Append(seenDomains, PWideChar(baseDomain));

      // Spider up to MaxPagesPerDomain URLs starting from this seed.
      for i := 1 to MaxPagesPerDomain do
      begin
        // Stop on the first failed crawl. A for-loop control variable
        // cannot be assigned in Delphi, so Break is used to leave the loop.
        if not CkSpider_CrawlNext(spider) then
          Break;

        // Display the URL we just crawled.
        Memo1.Lines.Add(CkSpider__lastUrl(spider));

        // If the last URL was retrieved from cache we don't wait;
        // otherwise pause before fetching the next URL.
        if not CkSpider_getLastFromCache(spider) then
          CkSpider_SleepMs(spider, CrawlDelayMs);
      end;

      // Add the outbound links to seedUrls, except for the domains
      // we've already seen.
      for i := 0 to CkSpider_getNumOutboundLinks(spider) - 1 do
      begin
        url := CkSpider__getOutboundLink(spider, i);
        domain := CkSpider__getUrlDomain(spider, PWideChar(url));
        baseDomain := CkSpider__getBaseDomain(spider, PWideChar(domain));

        if not CkStringArray_Contains(seenDomains, PWideChar(baseDomain)) then
        begin
          // Don't let our list of seedUrls grow too large.
          if CkStringArray_getCount(seedUrls) < MaxSeedUrls then
            CkStringArray_Append(seedUrls, PWideChar(url));
        end;
      end;
    end;
  finally
    // Release the Chilkat objects even if an exception was raised above
    // (for example from the Memo1 update).
    CkSpider_Dispose(spider);
    CkStringArray_Dispose(seenDomains);
    CkStringArray_Dispose(seedUrls);
  end;
end;