Chilkat HOME Android™ Classic ASP C C++ C# Mono C# .NET Core C# C# UWP/WinRT DataFlex Delphi ActiveX Delphi DLL Visual FoxPro Java Lianja MFC Objective-C Perl PHP ActiveX PHP Extension PowerBuilder PowerShell PureBasic CkPython Chilkat2-Python Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ Visual Basic 6.0 VB.NET VB.NET UWP/WinRT VBScript Xojo Plugin Node.js Excel Go
(Excel) A Simple Web Crawler
This demonstrates a very simple web crawler using the Chilkat Spider component.
' A very simple web crawler built on the Chilkat Spider component.
'
' Starting from a single seed URL, the script repeatedly pops a URL from
' seedUrls, crawls up to 5 pages of that site, prints each crawled URL to the
' Immediate window, and then queues the site's outbound links whose base
' domain has not been seen yet. It stops when seedUrls is empty.

Dim spider As Chilkat.Spider
Set spider = Chilkat.NewSpider

Dim seenDomains As Chilkat.StringArray
Set seenDomains = Chilkat.NewStringArray

Dim seedUrls As Chilkat.StringArray
Set seedUrls = Chilkat.NewStringArray

' Declare every working variable explicitly so the code also compiles in
' modules that use Option Explicit.
Dim success As Boolean
Dim url As String
Dim domain As String
Dim baseDomain As String
Dim i As Long

seenDomains.Unique = True
seedUrls.Unique = True

' You will need to change the start URL to something else...
success = seedUrls.Append("http://something.whateverYouWant.com/")

' Set outbound URL exclude patterns.
' URLs matching any of these patterns will not be added to the
' collection of outbound links.
spider.AddAvoidOutboundLinkPattern "*?id=*"
spider.AddAvoidOutboundLinkPattern "*.mypages.*"
spider.AddAvoidOutboundLinkPattern "*.personal.*"
spider.AddAvoidOutboundLinkPattern "*.comcast.*"
spider.AddAvoidOutboundLinkPattern "*.aol.*"
spider.AddAvoidOutboundLinkPattern "*~*"

' Use a cache so we don't have to re-fetch URLs previously fetched.
spider.CacheDir = "c:/spiderCache/"
spider.FetchFromCache = True
spider.UpdateCache = True

Do While seedUrls.Count > 0

    url = seedUrls.Pop()
    spider.Initialize url

    ' Spider 5 URLs of this domain,
    ' but first save the base domain in seenDomains.
    domain = spider.GetUrlDomain(url)
    success = seenDomains.Append(spider.GetBaseDomain(domain))

    For i = 0 To 4
        success = spider.CrawlNext()
        If Not success Then
            ' CrawlNext reported failure: stop crawling this site.
            Exit For
        End If

        ' Display the URL we just crawled.
        Debug.Print spider.LastUrl

        ' If the last URL was retrieved from cache, we won't wait.
        ' Otherwise wait 1 second before fetching the next URL.
        If (spider.LastFromCache <> True) Then
            spider.SleepMs 1000
        End If
    Next

    ' Add the outbound links to seedUrls, except
    ' for the domains we've already seen.
    ' NOTE(review): a link is checked against seenDomains only at the time it
    ' is appended, so several queued URLs may share a base domain that is
    ' marked as seen later - confirm whether that is acceptable.
    For i = 0 To spider.NumOutboundLinks - 1
        url = spider.GetOutboundLink(i)
        domain = spider.GetUrlDomain(url)
        baseDomain = spider.GetBaseDomain(domain)
        If (seenDomains.Contains(baseDomain) = False) Then
            ' Don't let our list of seedUrls grow too large.
            If (seedUrls.Count < 1000) Then
                success = seedUrls.Append(url)
            End If
        End If
    Next

Loop
© 2000-2022 Chilkat Software, Inc. All Rights Reserved.