Chilkat HOME .NET Core C# Android™ AutoIt C C# C++ Chilkat2-Python CkPython Classic ASP DataFlex Delphi ActiveX Delphi DLL Go Java Lianja Mono C# Node.js Objective-C PHP ActiveX PHP Extension Perl PowerBuilder PowerShell PureBasic Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ VB.NET VBScript Visual Basic 6.0 Visual FoxPro Xojo Plugin
(PowerBuilder) A Simple Web Crawler
This demonstrates a very simple web crawler using the Chilkat Spider component.
// A very simple web crawler built on the Chilkat Spider COM component.
//
// Starting from a list of seed URLs, the script repeatedly pops a URL,
// crawls up to 5 pages of that site, records the site's base domain as
// "seen", and then queues the site's outbound links whose base domain has
// not been seen yet. It stops when the seed list is empty.

integer li_rc
integer li_Success
integer i
string ls_Url
string ls_Domain
string ls_BaseDomain
oleobject loo_Spider
oleobject loo_SeenDomains
oleobject loo_SeedUrls

// Create the COM objects. Every ConnectToNewObject result is checked, and
// anything created so far is destroyed before bailing out.
loo_Spider = create oleobject
// Use "Chilkat_9_5_0.Spider" for versions of Chilkat < 10.0.0
li_rc = loo_Spider.ConnectToNewObject("Chilkat.Spider")
if li_rc < 0 then
    destroy loo_Spider
    MessageBox("Error","Connecting to COM object failed")
    return
end if

loo_SeenDomains = create oleobject
// Use "Chilkat_9_5_0.StringArray" for versions of Chilkat < 10.0.0
li_rc = loo_SeenDomains.ConnectToNewObject("Chilkat.StringArray")
if li_rc < 0 then
    destroy loo_Spider
    destroy loo_SeenDomains
    MessageBox("Error","Connecting to COM object failed")
    return
end if

loo_SeedUrls = create oleobject
// Use "Chilkat_9_5_0.StringArray" for versions of Chilkat < 10.0.0
li_rc = loo_SeedUrls.ConnectToNewObject("Chilkat.StringArray")
if li_rc < 0 then
    destroy loo_Spider
    destroy loo_SeenDomains
    destroy loo_SeedUrls
    MessageBox("Error","Connecting to COM object failed")
    return
end if

loo_SeenDomains.Unique = 1
loo_SeedUrls.Unique = 1

// You will need to change the start URL to something else...
loo_SeedUrls.Append("http://something.whateverYouWant.com/")

// Set outbound URL exclude patterns.
// URLs matching any of these patterns will not be added to the
// collection of outbound links.
loo_Spider.AddAvoidOutboundLinkPattern("*?id=*")
loo_Spider.AddAvoidOutboundLinkPattern("*.mypages.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.personal.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.comcast.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.aol.*")
// "~~" is the PowerScript escape for a single literal tilde.
loo_Spider.AddAvoidOutboundLinkPattern("*~~*")

// Use a cache so we don't have to re-fetch URLs previously fetched.
loo_Spider.CacheDir = "c:/spiderCache/"
loo_Spider.FetchFromCache = 1
loo_Spider.UpdateCache = 1

do while loo_SeedUrls.Count > 0

    ls_Url = loo_SeedUrls.Pop()
    loo_Spider.Initialize(ls_Url)

    // Spider 5 URLs of this domain,
    // but first, save the base domain in seenDomains.
    ls_Domain = loo_Spider.GetUrlDomain(ls_Url)
    loo_SeenDomains.Append(loo_Spider.GetBaseDomain(ls_Domain))

    for i = 0 to 4
        li_Success = loo_Spider.CrawlNext()
        if li_Success <> 1 then
            // CrawlNext did not report success: stop crawling this site.
            exit
        end if

        // Display the URL we just crawled.
        // NOTE(review): "Write-Debug" is the example's output placeholder and
        // is not a PowerScript statement -- replace it with the application's
        // own logging/output call.
        Write-Debug loo_Spider.LastUrl

        // If the last URL was retrieved from cache, we won't wait.
        // Otherwise we'll wait 1 second before fetching the next URL.
        if loo_Spider.LastFromCache <> 1 then
            loo_Spider.SleepMs(1000)
        end if
    next

    // Add the outbound links to seedUrls, except
    // for the domains we've already seen.
    for i = 0 to loo_Spider.NumOutboundLinks - 1
        ls_Url = loo_Spider.GetOutboundLink(i)
        ls_Domain = loo_Spider.GetUrlDomain(ls_Url)
        ls_BaseDomain = loo_Spider.GetBaseDomain(ls_Domain)

        if loo_SeenDomains.Contains(ls_BaseDomain) = 0 then
            // Don't let our list of seedUrls grow too large.
            if loo_SeedUrls.Count < 1000 then
                loo_SeedUrls.Append(ls_Url)
            end if
        end if
    next

loop

destroy loo_Spider
destroy loo_SeenDomains
destroy loo_SeedUrls
© 2000-2024 Chilkat Software, Inc. All Rights Reserved.