Sample code for 30+ languages & platforms
SQL Server

A Simple Web Crawler

See more Spider Examples

This demonstrates a very simple web crawler using the Chilkat Spider component.

Chilkat SQL Server Downloads

SQL Server
-- Important: Strings returned by sp_OAMethod calls are subject to length limitations; see the Chilkat documentation note on this.
--
-- ChilkatSample: a very simple web crawler built on the Chilkat Spider
-- ActiveX component, driven through the sp_OA* OLE Automation procedures.
--
-- Behavior:
--   * Starts from one hard-coded seed URL (change it below).
--   * For each seed URL popped from the queue, crawls up to 5 URLs of that
--     site, PRINTing each URL crawled.
--   * Outbound links whose base domain has not been visited yet are queued
--     as new seed URLs (the queue is capped at 1000 entries).
--   * Ends when the seed queue is empty.
--
-- Takes no parameters and returns no result set; output is via PRINT only.
-- NOTE(review): sp_OACreate and friends normally require the
-- 'Ole Automation Procedures' server option to be enabled - confirm on the
-- target instance.
CREATE PROCEDURE ChilkatSample
AS
BEGIN
    -- All locals are declared once, up front.  T-SQL variables are scoped to
    -- the whole procedure (not to a BEGIN...END block), so re-declaring a
    -- name inside a loop body is a compile error.
    DECLARE @hr int
    DECLARE @success int
    -- Important: Do not use nvarchar(max).  See the warning about using nvarchar(max).
    DECLARE @sTmp0 nvarchar(4000)
    DECLARE @url nvarchar(4000)
    DECLARE @domain nvarchar(4000)
    DECLARE @baseDomain nvarchar(4000)

    -- Each counter/flag gets its own variable so that one value can never
    -- clobber another that is still being used as a loop bound.
    DECLARE @seedCount int      -- number of URLs currently queued in seedUrls
    DECLARE @numLinks int       -- outbound links found by the last crawl
    DECLARE @fromCache int      -- 1 if the last URL was served from the cache
    DECLARE @alreadySeen int    -- 1 if a link's base domain was already visited
    DECLARE @i int              -- loop index

    -- COM object tokens
    DECLARE @spider int
    DECLARE @seenDomains int
    DECLARE @seedUrls int

    SELECT @success = 0

    EXEC @hr = sp_OACreate 'Chilkat.Spider', @spider OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        RETURN
    END

    EXEC @hr = sp_OACreate 'Chilkat.StringArray', @seenDomains OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        -- Release what was already created before bailing out.
        EXEC @hr = sp_OADestroy @spider
        RETURN
    END

    EXEC @hr = sp_OACreate 'Chilkat.StringArray', @seedUrls OUT
    IF @hr <> 0
    BEGIN
        PRINT 'Failed to create ActiveX component'
        EXEC @hr = sp_OADestroy @spider
        EXEC @hr = sp_OADestroy @seenDomains
        RETURN
    END

    -- Both lists reject duplicate entries.
    EXEC sp_OASetProperty @seenDomains, 'Unique', 1
    EXEC sp_OASetProperty @seedUrls, 'Unique', 1

    -- You will need to change the start URL to something else...
    EXEC sp_OAMethod @seedUrls, 'Append', @success OUT, 'http://something.whateverYouWant.com/'

    -- Set outbound URL exclude patterns
    -- URLs matching any of these patterns will not be added to the
    -- collection of outbound links.
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*?id=*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.mypages.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.personal.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.comcast.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*.aol.*'
    EXEC sp_OAMethod @spider, 'AddAvoidOutboundLinkPattern', NULL, '*~*'

    -- Use a cache so we don't have to re-fetch URLs previously fetched.
    EXEC sp_OASetProperty @spider, 'CacheDir', 'c:/spiderCache/'
    EXEC sp_OASetProperty @spider, 'FetchFromCache', 1
    EXEC sp_OASetProperty @spider, 'UpdateCache', 1

    -- Reset before reading so that a failed property read ends the loop
    -- rather than leaving an undefined/stale count in the condition.
    SELECT @seedCount = 0
    EXEC sp_OAGetProperty @seedUrls, 'Count', @seedCount OUT

    WHILE @seedCount > 0
    BEGIN
        -- Take the next seed URL off the queue and point the spider at it.
        EXEC sp_OAMethod @seedUrls, 'Pop', @url OUT
        EXEC sp_OAMethod @spider, 'Initialize', NULL, @url

        -- Spider 5 URLs of this domain.
        -- but first, save the base domain in seenDomains
        EXEC sp_OAMethod @spider, 'GetUrlDomain', @domain OUT, @url
        EXEC sp_OAMethod @spider, 'GetBaseDomain', @sTmp0 OUT, @domain
        EXEC sp_OAMethod @seenDomains, 'Append', @success OUT, @sTmp0

        SELECT @i = 0
        WHILE @i <= 4
        BEGIN
            -- Reset first so a stale 1 from an earlier call is never
            -- mistaken for a successful crawl.
            SELECT @success = 0
            EXEC sp_OAMethod @spider, 'CrawlNext', @success OUT
            IF @success <> 1
                BREAK   -- nothing (more) to crawl for this site

            -- Display the URL we just crawled.
            EXEC sp_OAGetProperty @spider, 'LastUrl', @sTmp0 OUT
            PRINT @sTmp0

            -- If the last URL was retrieved from cache,
            -- we won't wait.  Otherwise we'll wait 1 second
            -- before fetching the next URL.
            SELECT @fromCache = 0
            EXEC sp_OAGetProperty @spider, 'LastFromCache', @fromCache OUT
            IF @fromCache <> 1
            BEGIN
                EXEC sp_OAMethod @spider, 'SleepMs', NULL, 1000
            END

            SELECT @i = @i + 1
        END

        -- Add the outbound links to seedUrls, except
        -- for the domains we've already seen.
        SELECT @numLinks = 0
        EXEC sp_OAGetProperty @spider, 'NumOutboundLinks', @numLinks OUT

        SELECT @i = 0
        WHILE @i < @numLinks
        BEGIN
            EXEC sp_OAMethod @spider, 'GetOutboundLink', @url OUT, @i
            EXEC sp_OAMethod @spider, 'GetUrlDomain', @domain OUT, @url
            EXEC sp_OAMethod @spider, 'GetBaseDomain', @baseDomain OUT, @domain

            -- Default to "seen" so a failed lookup skips the link instead of
            -- reusing the previous iteration's answer.
            SELECT @alreadySeen = 1
            EXEC sp_OAMethod @seenDomains, 'Contains', @alreadySeen OUT, @baseDomain
            IF @alreadySeen = 0
            BEGIN
                -- Don't let our list of seedUrls grow too large.
                EXEC sp_OAGetProperty @seedUrls, 'Count', @seedCount OUT
                IF @seedCount < 1000
                BEGIN
                    EXEC sp_OAMethod @seedUrls, 'Append', @success OUT, @url
                END
            END

            SELECT @i = @i + 1
        END

        -- Re-read the queue length so the outer loop condition reflects the
        -- Pop above and any links appended during this iteration.
        SELECT @seedCount = 0
        EXEC sp_OAGetProperty @seedUrls, 'Count', @seedCount OUT
    END

    -- Release the COM objects.
    EXEC @hr = sp_OADestroy @spider
    EXEC @hr = sp_OADestroy @seenDomains
    EXEC @hr = sp_OADestroy @seedUrls

END
GO