SQL Server
SQL Server
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component. Chilkat SQL Server Downloads
-- Important: See this note about string length limitations for strings returned by sp_OAMethod calls.
--
CREATE PROCEDURE ChilkatSample
AS
BEGIN
    -- Simple web crawler built on the Chilkat.Spider ActiveX component, driven
    -- through the SQL Server OLE Automation procedures (sp_OACreate, sp_OAMethod,
    -- sp_OAGetProperty, sp_OASetProperty, sp_OADestroy).
    --
    -- Outline:
    --   1. Create a Spider plus two StringArray objects: seedUrls (URLs still to
    --      be crawled) and seenDomains (base domains already visited).
    --   2. While seedUrls is not empty: take one URL from it, crawl up to 5 pages
    --      starting there, then append the spider's outbound links to seedUrls
    --      unless their base domain is already in seenDomains.
    --   3. Destroy all three automation objects.
    --
    -- Takes no parameters. Each crawled URL is written with PRINT.

    -- All variables are declared once, up front: T-SQL variables are scoped to
    -- the whole procedure, so declaring the same name again inside a loop body
    -- is a compile error.
    DECLARE @hr int
    -- Scratch integer for short-lived results (flags, one-off counts). It must
    -- never be used as a loop bound, because it is overwritten frequently.
    DECLARE @iTmp0 int
    -- Important: Do not use nvarchar(max). See the warning about using nvarchar(max).
    DECLARE @sTmp0 nvarchar(4000)
    DECLARE @success int
    DECLARE @spider int
    DECLARE @seenDomains int
    DECLARE @seedUrls int
    DECLARE @url nvarchar(4000)
    DECLARE @domain nvarchar(4000)
    DECLARE @baseDomain nvarchar(4000)
    DECLARE @i int
    -- Dedicated loop-control values, kept separate from @iTmp0 so that work done
    -- inside the loops cannot corrupt the loop conditions.
    DECLARE @seedCount int
    DECLARE @linkCount int

    SELECT @success = 0

    EXEC @hr = sp_OACreate 'Chilkat.Spider', @spider OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        RETURN
    END

    EXEC @hr = sp_OACreate 'Chilkat.StringArray', @seenDomains OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        -- Release what was already created before bailing out.
        EXEC @hr = sp_OADestroy @spider
        RETURN
    END

    EXEC @hr = sp_OACreate 'Chilkat.StringArray', @seedUrls OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        EXEC @hr = sp_OADestroy @spider
        EXEC @hr = sp_OADestroy @seenDomains
        RETURN
    END

    EXEC sp_OASetProperty @seenDomains, 'Unique', 1
    EXEC sp_OASetProperty @seedUrls, 'Unique', 1

    -- You will need to change the start URL to something else...
    EXEC sp_OAMethod @seedUrls, 'Append', @success OUT, 'http://something.whateverYouWant.com/'

    -- Set outbound URL exclude patterns
    -- URLs matching any of these patterns will not be added to the
    -- collection of outbound links.
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*?id=*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.mypages.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.personal.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.comcast.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.aol.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*~*'

    -- Use a cache so we don't have to re-fetch URLs previously fetched.
    EXEC sp_OASetProperty @spider, 'CacheDir', 'c:/spiderCache/'
    EXEC sp_OASetProperty @spider, 'FetchFromCache', 1
    EXEC sp_OASetProperty @spider, 'UpdateCache', 1

    -- Reset to 0 before every read so that a failed property fetch ends the
    -- loop instead of leaving a stale positive count behind.
    SELECT @seedCount = 0
    EXEC sp_OAGetProperty @seedUrls, 'Count', @seedCount OUT

    WHILE @seedCount > 0
    BEGIN
        EXEC sp_OAMethod @seedUrls, 'Pop', @url OUT
        EXEC sp_OAMethod @spider, 'Initialize', NULL, @url

        -- Spider 5 URLs of this domain.
        -- but first, save the base domain in seenDomains
        EXEC sp_OAMethod @spider, 'GetUrlDomain', @domain OUT, @url
        EXEC sp_OAMethod @spider, 'GetBaseDomain', @sTmp0 OUT, @domain
        EXEC sp_OAMethod @seenDomains, 'Append', @success OUT, @sTmp0

        SELECT @i = 0
        WHILE @i <= 4
        BEGIN
            EXEC sp_OAMethod @spider, 'CrawlNext', @success OUT
            IF @success <> 1
            BEGIN
                -- Nothing more could be crawled for this seed; stop early.
                BREAK
            END

            -- Display the URL we just crawled.
            EXEC sp_OAGetProperty @spider, 'LastUrl', @sTmp0 OUT
            PRINT @sTmp0

            -- If the last URL was retrieved from cache,
            -- we won't wait. Otherwise we'll wait 1 second
            -- before fetching the next URL.
            EXEC sp_OAGetProperty @spider, 'LastFromCache', @iTmp0 OUT
            IF @iTmp0 <> 1
            BEGIN
                EXEC sp_OAMethod @spider, 'SleepMs', NULL, 1000
            END

            SELECT @i = @i + 1
        END

        -- Add the outbound links to seedUrls, except
        -- for the domains we've already seen.
        SELECT @linkCount = 0
        EXEC sp_OAGetProperty @spider, 'NumOutboundLinks', @linkCount OUT

        SELECT @i = 0
        WHILE @i < @linkCount
        BEGIN
            EXEC sp_OAMethod @spider, 'GetOutboundLink', @url OUT, @i
            EXEC sp_OAMethod @spider, 'GetUrlDomain', @domain OUT, @url
            EXEC sp_OAMethod @spider, 'GetBaseDomain', @baseDomain OUT, @domain

            -- @iTmp0 is only a scratch flag here; the loop bound is @linkCount.
            EXEC sp_OAMethod @seenDomains, 'Contains', @iTmp0 OUT, @baseDomain
            IF @iTmp0 = 0
            BEGIN
                -- Don't let our list of seedUrls grow too large.
                EXEC sp_OAGetProperty @seedUrls, 'Count', @iTmp0 OUT
                IF @iTmp0 < 1000
                BEGIN
                    EXEC sp_OAMethod @seedUrls, 'Append', @success OUT, @url
                END
            END

            SELECT @i = @i + 1
        END

        -- Re-read the number of pending seed URLs so the outer loop condition
        -- reflects the current state of seedUrls.
        SELECT @seedCount = 0
        EXEC sp_OAGetProperty @seedUrls, 'Count', @seedCount OUT
    END

    EXEC @hr = sp_OADestroy @spider
    EXEC @hr = sp_OADestroy @seenDomains
    EXEC @hr = sp_OADestroy @seedUrls
END
GO