Skip to content

Commit 91559bd

Browse files
committed
follow links to subdomains if :crawl_subdomains => true
1 parent c0f75cc commit 91559bd

File tree

3 files changed

+26
-10
lines changed

3 files changed

+26
-10
lines changed

lib/anemone/core.rb

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ class Core
5454
:link_queue => Queue.new,
5555
# Manager for the page processing queue
5656
:page_queue => Queue.new,
57+
# Crawl subdomains?
58+
:crawl_subdomains => false,
5759
}
5860

5961
# Create setter methods for all options to be called from the crawl block
@@ -70,6 +72,7 @@ class Core
7072
def initialize(urls, opts = {})
7173
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
7274
@urls.each{ |url| url.path = '/' if url.path.empty? }
75+
@valid_domains = @urls.map{|u| [u.host,u.host.gsub(/^www\./,'.')]}.flatten.compact.uniq
7376

7477
@tentacles = []
7578
@on_every_page_blocks = []
@@ -254,7 +257,16 @@ def visit_link?(link, from_page = nil)
254257
!skip_link?(link) &&
255258
!skip_query_string?(link) &&
256259
allowed(link) &&
257-
!too_deep?(from_page)
260+
!too_deep?(from_page) &&
261+
(in_allowed_domain?(link) or in_allowed_subdomain?(link))
262+
end
263+
264+
#
# Returns +true+ if the host of *link* is exactly one of the entries in
# @valid_domains, returns +false+ otherwise (including when *link* has no
# host at all).
#
def in_allowed_domain?(link)
  # include? states the membership test directly and yields a real boolean,
  # rather than an Integer position / nil as Array#index does.
  @valid_domains.include?(link.host)
end
267+
268+
#
# Returns a truthy value if the :crawl_subdomains option is enabled and the
# host of *link* is one of the entries in @valid_domains or a subdomain of
# one; returns a falsy value otherwise.
#
# On a match the matching @valid_domains entry is returned, so callers
# should rely only on truthiness.
#
def in_allowed_subdomain?(link)
  return false unless opts[:crawl_subdomains]

  # Links such as mailto: carry no host; they can never be in a subdomain.
  host = link.host
  return false if host.nil?

  @valid_domains.find do |domain|
    # Only match on a label boundary: "evilexample.com" must not pass for
    # "example.com". Entries that already start with a dot are used as-is.
    suffix = domain.start_with?('.') ? domain : ".#{domain}"
    host == domain || host.end_with?(suffix)
  end
end
259271

260272
#

lib/anemone/page.rb

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def links
6363
u = a['href']
6464
next if u.nil? or u.empty?
6565
abs = to_absolute(URI(u)) rescue next
66-
@links << abs if in_domain?(abs)
66+
@links << abs
6767
end
6868
@links.uniq!
6969
@links
@@ -158,14 +158,6 @@ def to_absolute(link)
158158
return absolute
159159
end
160160

161-
#
162-
# Returns +true+ if *uri* is in the same domain as the page, returns
163-
# +false+ otherwise
164-
#
165-
def in_domain?(uri)
166-
uri.host == @url.host
167-
end
168-
169161
def marshal_dump
170162
[@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
171163
end

spec/core_spec.rb

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,18 @@ module Anemone
4141
core.pages.keys.should_not include('http://www.other.com/')
4242
end
4343

44+
it "should follow links to subdomains" do
  # Page 0 links to page 1 and also carries absolute hrefs to www.other.com
  # and to subdomain.example.com.
  start_page = FakePage.new('0',
                            :links => ['1'],
                            :hrefs => [ 'http://www.other.com/', 'http://subdomain.example.com/'])
  pages = [start_page, FakePage.new('1')]

  crawl_opts = @opts.merge({:crawl_subdomains => true})
  core = Anemone.crawl(pages[0].url, crawl_opts)

  # Expect three pages crawled, with the subdomain URL among them and the
  # other.com URL absent.
  core.should have(3).pages
  crawled_urls = core.pages.keys
  crawled_urls.should_not include('http://www.other.com/')
  crawled_urls.should include('http://subdomain.example.com/')
end
55+
4456
it "should follow http redirects" do
4557
pages = []
4658
pages << FakePage.new('0', :links => ['1'])

0 commit comments

Comments
 (0)