Chilkat HOME .NET Core C# Android™ AutoIt C C# C++ Chilkat2-Python CkPython Classic ASP DataFlex Delphi ActiveX Delphi DLL Go Java Lianja Mono C# Node.js Objective-C PHP ActiveX PHP Extension Perl PowerBuilder PowerShell PureBasic Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ VB.NET VBScript Visual Basic 6.0 Visual FoxPro Xojo Plugin
(Visual FoxPro) A Simple Web Crawler
This demonstrates a very simple web crawler using the Chilkat Spider component.
* A simple web crawler built on the Chilkat Spider component.
*
* Starting from one seed URL, the script crawls up to 5 pages of the seed's
* domain, prints each crawled URL, and then queues the outbound links that
* point to base domains not crawled yet. It repeats until the seed queue
* is empty.

LOCAL loSpider
LOCAL loSeenDomains
LOCAL loSeedUrls
LOCAL lcUrl
LOCAL lcDomain
LOCAL lcBaseDomain
LOCAL i
LOCAL lnPattern
LOCAL lnSuccess
LOCAL ARRAY laAvoidPatterns[6]

* For versions of Chilkat < 10.0.0, use CreateObject('Chilkat_9_5_0.Spider')
loSpider = CreateObject('Chilkat.Spider')

* For versions of Chilkat < 10.0.0, use CreateObject('Chilkat_9_5_0.StringArray')
loSeenDomains = CreateObject('Chilkat.StringArray')

* For versions of Chilkat < 10.0.0, use CreateObject('Chilkat_9_5_0.StringArray')
loSeedUrls = CreateObject('Chilkat.StringArray')

* Unique = 1 makes both arrays ignore duplicate entries.
loSeenDomains.Unique = 1
loSeedUrls.Unique = 1

* You will need to change the start URL to something else...
loSeedUrls.Append("http://something.whateverYouWant.com/")

* Outbound URL exclude patterns.
* URLs matching any of these patterns will not be added to the
* collection of outbound links.
laAvoidPatterns[1] = "*?id=*"
laAvoidPatterns[2] = "*.mypages.*"
laAvoidPatterns[3] = "*.personal.*"
laAvoidPatterns[4] = "*.comcast.*"
laAvoidPatterns[5] = "*.aol.*"
laAvoidPatterns[6] = "*~*"

* Use a cache so we don't have to re-fetch URLs previously fetched.
loSpider.CacheDir = "c:/spiderCache/"
loSpider.FetchFromCache = 1
loSpider.UpdateCache = 1

DO WHILE loSeedUrls.Count > 0

    lcUrl = loSeedUrls.Pop()

    * Several queued seeds may share one base domain, because a domain is
    * only marked as seen once one of its URLs is actually crawled.
    * Skip seeds whose base domain has been crawled in the meantime.
    lcDomain = loSpider.GetUrlDomain(lcUrl)
    lcBaseDomain = loSpider.GetBaseDomain(lcDomain)
    IF (loSeenDomains.Contains(lcBaseDomain) <> 0) THEN
        LOOP
    ENDIF

    loSpider.Initialize(lcUrl)

    * Per the Chilkat Spider reference documentation, Initialize clears any
    * patterns added via AddAvoidOutboundLinkPattern, so the exclude
    * patterns are registered again for every seed.
    FOR lnPattern = 1 TO ALEN(laAvoidPatterns)
        loSpider.AddAvoidOutboundLinkPattern(laAvoidPatterns[lnPattern])
    NEXT

    * Spider 5 URLs of this domain,
    * but first, save the base domain in seenDomains.
    loSeenDomains.Append(lcBaseDomain)

    FOR i = 0 TO 4
        lnSuccess = loSpider.CrawlNext()
        IF (lnSuccess <> 1) THEN
            * CrawlNext did not report success: stop crawling this domain.
            EXIT
        ENDIF

        * Display the URL we just crawled.
        ? loSpider.LastUrl

        * If the last URL was retrieved from cache,
        * we won't wait. Otherwise we'll wait 1 second
        * before fetching the next URL.
        IF (loSpider.LastFromCache <> 1) THEN
            loSpider.SleepMs(1000)
        ENDIF
    NEXT

    * Add the outbound links to seedUrls, except
    * for the domains we've already seen.
    FOR i = 0 TO loSpider.NumOutboundLinks - 1
        lcUrl = loSpider.GetOutboundLink(i)
        lcDomain = loSpider.GetUrlDomain(lcUrl)
        lcBaseDomain = loSpider.GetBaseDomain(lcDomain)
        IF (loSeenDomains.Contains(lcBaseDomain) = 0) THEN
            * Don't let our list of seedUrls grow too large.
            IF (loSeedUrls.Count < 1000) THEN
                loSeedUrls.Append(lcUrl)
            ENDIF
        ENDIF
    NEXT

ENDDO

RELEASE loSpider
RELEASE loSeenDomains
RELEASE loSeedUrls
© 2000-2024 Chilkat Software, Inc. All Rights Reserved.