Chilkat HOME .NET Core C# Android™ AutoIt C C# C++ Chilkat2-Python CkPython Classic ASP DataFlex Delphi ActiveX Delphi DLL Go Java Lianja Mono C# Node.js Objective-C PHP ActiveX PHP Extension Perl PowerBuilder PowerShell PureBasic Ruby SQL Server Swift 2 Swift 3,4,5... Tcl Unicode C Unicode C++ VB.NET VBScript Visual Basic 6.0 Visual FoxPro Xojo Plugin
(SQL Server) A Simple Web Crawler — This demonstrates a very simple web crawler using the Chilkat Spider component.
-- Important: See this note about string length limitations for strings returned by sp_OAMethod calls.
--
-- ChilkatSample: a very simple web crawler built on the Chilkat Spider ActiveX
-- component, driven through the SQL Server OLE Automation procedures
-- (sp_OACreate / sp_OAMethod / sp_OAGetProperty / sp_OASetProperty / sp_OADestroy).
--
-- Flow:
--   1. Create a Spider plus two StringArray objects (seedUrls, seenDomains).
--   2. While seed URLs remain: pop one, crawl up to 5 URLs within that domain,
--      then add the spider's outbound links to seedUrls unless their base
--      domain has already been seen (seedUrls is capped at 1000 entries).
--   3. Destroy the automation objects.
--
-- Takes no parameters; each crawled URL is written with PRINT.
CREATE PROCEDURE ChilkatSample
AS
BEGIN
    -- All locals are declared once, up front: T-SQL variables are scoped to the
    -- whole batch/procedure, so re-declaring a name inside a loop body is an error.
    DECLARE @hr int
    -- Important: Do not use nvarchar(max).  See the warning about using nvarchar(max).
    DECLARE @sTmp0 nvarchar(4000)

    DECLARE @spider int
    DECLARE @seenDomains int
    DECLARE @seedUrls int

    DECLARE @success int
    DECLARE @url nvarchar(4000)
    DECLARE @domain nvarchar(4000)
    DECLARE @baseDomain nvarchar(4000)
    DECLARE @i int

    -- Each value read back from an automation object gets its own variable so
    -- that no loop condition is clobbered by an unrelated property read.
    DECLARE @seedCount int
    DECLARE @lastFromCache int
    DECLARE @numOutboundLinks int
    DECLARE @domainSeen int

    -- Use "Chilkat_9_5_0.Spider" for versions of Chilkat < 10.0.0
    EXEC @hr = sp_OACreate 'Chilkat.Spider', @spider OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        RETURN
    END

    -- Use "Chilkat_9_5_0.StringArray" for versions of Chilkat < 10.0.0
    EXEC @hr = sp_OACreate 'Chilkat.StringArray', @seenDomains OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        -- Release the object that was already created before bailing out.
        EXEC @hr = sp_OADestroy @spider
        RETURN
    END

    -- Use "Chilkat_9_5_0.StringArray" for versions of Chilkat < 10.0.0
    EXEC @hr = sp_OACreate 'Chilkat.StringArray', @seedUrls OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        EXEC @hr = sp_OADestroy @spider
        EXEC @hr = sp_OADestroy @seenDomains
        RETURN
    END

    EXEC sp_OASetProperty @seenDomains, 'Unique', 1
    EXEC sp_OASetProperty @seedUrls, 'Unique', 1

    -- You will need to change the start URL to something else...
    EXEC sp_OAMethod @seedUrls, 'Append', @success OUT, 'http://something.whateverYouWant.com/'

    -- Set outbound URL exclude patterns
    -- URLs matching any of these patterns will not be added to the
    -- collection of outbound links.
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*?id=*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.mypages.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.personal.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.comcast.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.aol.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*~*'

    -- Use a cache so we don't have to re-fetch URLs previously fetched.
    EXEC sp_OASetProperty @spider, 'CacheDir', 'c:/spiderCache/'
    EXEC sp_OASetProperty @spider, 'FetchFromCache', 1
    EXEC sp_OASetProperty @spider, 'UpdateCache', 1

    EXEC sp_OAGetProperty @seedUrls, 'Count', @seedCount OUT
    WHILE @seedCount > 0
    BEGIN
        EXEC sp_OAMethod @seedUrls, 'Pop', @url OUT
        EXEC sp_OAMethod @spider, 'Initialize', NULL, @url

        -- Spider 5 URLs of this domain.
        -- but first, save the base domain in seenDomains
        EXEC sp_OAMethod @spider, 'GetUrlDomain', @domain OUT, @url
        EXEC sp_OAMethod @spider, 'GetBaseDomain', @sTmp0 OUT, @domain
        EXEC sp_OAMethod @seenDomains, 'Append', @success OUT, @sTmp0

        SELECT @i = 0
        WHILE @i <= 4
        BEGIN
            EXEC sp_OAMethod @spider, 'CrawlNext', @success OUT
            IF @success = 1
            BEGIN
                -- Display the URL we just crawled.
                EXEC sp_OAGetProperty @spider, 'LastUrl', @sTmp0 OUT
                PRINT @sTmp0

                -- If the last URL was retrieved from cache,
                -- we won't wait.  Otherwise we'll wait 1 second
                -- before fetching the next URL.
                EXEC sp_OAGetProperty @spider, 'LastFromCache', @lastFromCache OUT
                IF @lastFromCache <> 1
                BEGIN
                    EXEC sp_OAMethod @spider, 'SleepMs', NULL, 1000
                END
            END
            ELSE
            BEGIN
                -- cause the loop to exit..
                SELECT @i = 999
            END

            SELECT @i = @i + 1
        END

        -- Add the outbound links to seedUrls, except
        -- for the domains we've already seen.
        EXEC sp_OAGetProperty @spider, 'NumOutboundLinks', @numOutboundLinks OUT
        SELECT @i = 0
        WHILE @i <= @numOutboundLinks - 1
        BEGIN
            EXEC sp_OAMethod @spider, 'GetOutboundLink', @url OUT, @i
            EXEC sp_OAMethod @spider, 'GetUrlDomain', @domain OUT, @url
            EXEC sp_OAMethod @spider, 'GetBaseDomain', @baseDomain OUT, @domain

            EXEC sp_OAMethod @seenDomains, 'Contains', @domainSeen OUT, @baseDomain
            IF @domainSeen = 0
            BEGIN
                -- Don't let our list of seedUrls grow too large.
                EXEC sp_OAGetProperty @seedUrls, 'Count', @seedCount OUT
                IF @seedCount < 1000
                BEGIN
                    EXEC sp_OAMethod @seedUrls, 'Append', @success OUT, @url
                END
            END

            SELECT @i = @i + 1
        END

        -- Re-read the number of remaining seed URLs so the outer loop condition
        -- reflects the Pop and Append calls made during this iteration.
        EXEC sp_OAGetProperty @seedUrls, 'Count', @seedCount OUT
    END

    EXEC @hr = sp_OADestroy @spider
    EXEC @hr = sp_OADestroy @seenDomains
    EXEC @hr = sp_OADestroy @seedUrls
END
GO
© 2000-2024 Chilkat Software, Inc. All Rights Reserved.