Chilkat HOME .NET Core C# Android™ AutoIt C C# C++ Chilkat2-Python CkPython Classic ASP DataFlex Delphi ActiveX Delphi DLL Go Java Lianja Mono C# Node.js Objective-C PHP ActiveX PHP Extension Perl PowerBuilder PowerShell PureBasic Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ VB.NET VBScript Visual Basic 6.0 Visual FoxPro Xojo Plugin
(AutoIt) A Simple Web Crawler — This demonstrates a very simple web crawler using the Chilkat Spider component.
; A very simple web crawler built on the Chilkat Spider ActiveX component.
;
; Starting from the seed URL(s), the script repeatedly:
;   1. pops a URL from the seed list and initializes the spider with it,
;   2. records that URL's base domain as "seen",
;   3. crawls up to 5 URLs via CrawlNext(), printing each crawled URL,
;   4. appends outbound links whose base domain has not been seen yet
;      to the seed list (while the list holds fewer than 1000 entries).
; The script ends when the seed list is empty.

$oSpider = ObjCreate("Chilkat.Spider")
$oSeenDomains = ObjCreate("Chilkat.StringArray")
$oSeedUrls = ObjCreate("Chilkat.StringArray")

; ObjCreate does not return an object on failure (e.g. the Chilkat ActiveX
; is not registered); stop with a clear message instead of failing on the
; first property access below.
If Not (IsObj($oSpider) And IsObj($oSeenDomains) And IsObj($oSeedUrls)) Then
    ConsoleWrite("Failed to create the Chilkat ActiveX objects." & @CRLF)
    Exit 1
EndIf

$oSeenDomains.Unique = True
$oSeedUrls.Unique = True

; You will need to change the start URL to something else...
$oSeedUrls.Append("http://something.whateverYouWant.com/")

; Set outbound URL exclude patterns
; URLs matching any of these patterns will not be added to the
; collection of outbound links.
; NOTE: AutoIt requires parentheses on COM method calls, even when the
; return value is not used.
$oSpider.AddAvoidOutboundLinkPattern("*?id=*")
$oSpider.AddAvoidOutboundLinkPattern("*.mypages.*")
$oSpider.AddAvoidOutboundLinkPattern("*.personal.*")
$oSpider.AddAvoidOutboundLinkPattern("*.comcast.*")
$oSpider.AddAvoidOutboundLinkPattern("*.aol.*")
$oSpider.AddAvoidOutboundLinkPattern("*~*")

; Use a cache so we don't have to re-fetch URLs previously fetched.
$oSpider.CacheDir = "c:/spiderCache/"
$oSpider.FetchFromCache = True
$oSpider.UpdateCache = True

; Declare the working variables once, rather than re-declaring them on
; every loop iteration.
Local $sUrl, $sDomain, $sBaseDomain, $bSuccess, $i

While $oSeedUrls.Count > 0

    $sUrl = $oSeedUrls.Pop()
    $oSpider.Initialize($sUrl)

    ; Spider 5 URLs of this domain.
    ; but first, save the base domain in seenDomains
    $sDomain = $oSpider.GetUrlDomain($sUrl)
    $oSeenDomains.Append($oSpider.GetBaseDomain($sDomain))

    For $i = 0 To 4
        $bSuccess = $oSpider.CrawlNext()
        If ($bSuccess = True) Then
            ; Display the URL we just crawled.
            ConsoleWrite($oSpider.LastUrl & @CRLF)

            ; If the last URL was retrieved from cache,
            ; we won't wait. Otherwise we'll wait 1 second
            ; before fetching the next URL.
            If ($oSpider.LastFromCache <> True) Then
                $oSpider.SleepMs(1000)
            EndIf
        Else
            ; CrawlNext did not succeed: stop crawling this domain.
            ExitLoop
        EndIf
    Next

    ; Add the outbound links to seedUrls, except
    ; for the domains we've already seen.
    For $i = 0 To $oSpider.NumOutboundLinks - 1
        $sUrl = $oSpider.GetOutboundLink($i)
        $sDomain = $oSpider.GetUrlDomain($sUrl)
        $sBaseDomain = $oSpider.GetBaseDomain($sDomain)
        If ($oSeenDomains.Contains($sBaseDomain) = False) Then
            ; Don't let our list of seedUrls grow too large.
            If ($oSeedUrls.Count < 1000) Then
                $oSeedUrls.Append($sUrl)
            EndIf
        EndIf
    Next

Wend
© 2000-2024 Chilkat Software, Inc. All Rights Reserved.