Swift
Swift
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component.
Chilkat Swift Downloads
/// Runs a small demonstration crawl with the Chilkat Spider component.
///
/// Seed URLs are processed one at a time: the spider is initialized on the seed,
/// the seed's base domain is recorded as seen, up to `maxPagesPerSeed` pages are
/// crawled (each crawled URL is printed), and then the spider's outbound links
/// whose base domain has not been seen yet are queued as new seeds. The function
/// returns once the seed list is empty.
///
/// Fetched pages are cached under `c:/spiderCache/`; change `cacheDir` to a
/// directory that exists on the machine running the example.
func chilkatTest() {
    // Number of crawl attempts made for each seed before moving on.
    let maxPagesPerSeed = 5
    // Upper bound on the number of pending seed URLs, so the queue cannot grow without limit.
    let maxPendingSeeds = 1000

    let spider = CkoSpider()!
    let seenDomains = CkoStringArray()!
    let seedUrls = CkoStringArray()!

    seenDomains.unique = true
    seedUrls.unique = true

    // You will need to change the start URL to something else...
    seedUrls.append(str: "http://something.whateverYouWant.com/")

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    spider.addAvoidOutboundLinkPattern(pattern: "*?id=*")
    spider.addAvoidOutboundLinkPattern(pattern: "*.mypages.*")
    spider.addAvoidOutboundLinkPattern(pattern: "*.personal.*")
    spider.addAvoidOutboundLinkPattern(pattern: "*.comcast.*")
    spider.addAvoidOutboundLinkPattern(pattern: "*.aol.*")
    spider.addAvoidOutboundLinkPattern(pattern: "*~*")

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    spider.cacheDir = "c:/spiderCache/"
    spider.fetchFromCache = true
    spider.updateCache = true

    while seedUrls.count.intValue > 0 {
        let seedUrl: String? = seedUrls.pop()
        spider.initialize(domain: seedUrl)

        // Before crawling, remember the seed's base domain so that links
        // pointing back to it are not queued again later.
        let seedDomain: String? = spider.getUrlDomain(url: seedUrl)
        seenDomains.append(str: spider.getBaseDomain(domain: seedDomain))

        // Crawl a handful of URLs from this seed; stop early on the first failure.
        for _ in 0..<maxPagesPerSeed {
            let crawled: Bool = spider.crawlNext()
            guard crawled else { break }

            // Display the URL we just crawled.
            if let lastUrl = spider.lastUrl {
                print("\(lastUrl)")
            }

            // If the last URL was retrieved from cache, we won't wait.
            // Otherwise we'll wait 1 second before fetching the next URL.
            if spider.lastFromCache != true {
                spider.sleepMs(millisec: 1000)
            }
        }

        // Add the outbound links to seedUrls, except for the domains
        // we've already seen. `max` guards the range against a negative count.
        let outboundCount = max(0, spider.numOutboundLinks.intValue)
        for i in 0..<outboundCount {
            let linkUrl: String? = spider.getOutboundLink(index: i)
            let linkDomain: String? = spider.getUrlDomain(url: linkUrl)
            let linkBaseDomain: String? = spider.getBaseDomain(domain: linkDomain)

            if seenDomains.contains(str: linkBaseDomain) == false {
                // Don't let our list of seedUrls grow too large.
                if seedUrls.count.intValue < maxPendingSeeds {
                    seedUrls.append(str: linkUrl)
                }
            }
        }
    }
}