Chilkat HOME .NET Core C# Android™ AutoIt C C# C++ Chilkat2-Python CkPython Classic ASP DataFlex Delphi ActiveX Delphi DLL Go Java Lianja Mono C# Node.js Objective-C PHP ActiveX PHP Extension Perl PowerBuilder PowerShell PureBasic Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ VB.NET VBScript Visual Basic 6.0 Visual FoxPro Xojo Plugin
(Lianja) A Simple Web Crawler. This demonstrates a very simple web crawler using the Chilkat Spider component.
// A simple web crawler built on the Chilkat Spider component.
//
// Starting from a list of seed URLs, the script repeatedly pops a URL,
// crawls up to 5 URLs starting from it, and then queues the outbound
// links whose base domain has not been recorded as seen yet.
// The loop ends when the seed URL list is empty.

loSpider = createobject("CkSpider")
loSeenDomains = createobject("CkStringArray")
loSeedUrls = createobject("CkStringArray")

loSeenDomains.Unique = .T.
loSeedUrls.Unique = .T.

// You will need to change the start URL to something else...
loSeedUrls.Append("http://something.whateverYouWant.com/")

// Set outbound URL exclude patterns
// URLs matching any of these patterns will not be added to the
// collection of outbound links.
loSpider.AddAvoidOutboundLinkPattern("*?id=*")
loSpider.AddAvoidOutboundLinkPattern("*.mypages.*")
loSpider.AddAvoidOutboundLinkPattern("*.personal.*")
loSpider.AddAvoidOutboundLinkPattern("*.comcast.*")
loSpider.AddAvoidOutboundLinkPattern("*.aol.*")
loSpider.AddAvoidOutboundLinkPattern("*~*")

// Use a cache so we don't have to re-fetch URLs previously fetched.
loSpider.CacheDir = "c:/spiderCache/"
loSpider.FetchFromCache = .T.
loSpider.UpdateCache = .T.

do while loSeedUrls.Count > 0

    lcUrl = loSeedUrls.Pop()
    loSpider.Initialize(lcUrl)

    // Record the base domain of this start URL in seenDomains
    // before spidering it.
    lcDomain = loSpider.GetUrlDomain(lcUrl)
    loSeenDomains.Append(loSpider.GetBaseDomain(lcDomain))

    // Spider 5 URLs of this domain.
    for i = 0 to 4
        llSuccess = loSpider.CrawlNext()
        if (llSuccess <> .T.) then
            // CrawlNext did not succeed: stop crawling from this start URL.
            exit
        endif

        // Display the URL we just crawled.
        ? loSpider.LastUrl

        // If the last URL was retrieved from cache,
        // we won't wait.  Otherwise we'll wait 1 second
        // before fetching the next URL.
        if (loSpider.LastFromCache <> .T.) then
            loSpider.SleepMs(1000)
        endif
    next

    // Add the outbound links to seedUrls, except
    // for the domains we've already seen.
    for i = 0 to loSpider.NumOutboundLinks - 1
        lcUrl = loSpider.GetOutboundLink(i)
        lcDomain = loSpider.GetUrlDomain(lcUrl)
        lcBaseDomain = loSpider.GetBaseDomain(lcDomain)
        if (loSeenDomains.Contains(lcBaseDomain) = .F.) then
            // Don't let our list of seedUrls grow too large.
            if (loSeedUrls.Count < 1000) then
                loSeedUrls.Append(lcUrl)
            endif
        endif
    next

enddo

// Release the Chilkat objects created above.
release loSpider
release loSeenDomains
release loSeedUrls
© 2000-2024 Chilkat Software, Inc. All Rights Reserved.