Module: Arachnid2::Exoskeleton

Included in:
Typhoeus, Watir
Defined in:
lib/arachnid2/exoskeleton.rb

Instance Method Summary

Instance Method Details

#bound_time ⇒ Object (also known as: time_limit)



62
63
64
65
66
67
68
# File 'lib/arachnid2/exoskeleton.rb', line 62

# Computes the point in time at which the configured time box runs out.
#
# The +:time_box+ option is coerced to an Integer through its string form, so
# nil, blank, or values without a leading integer become 0. Anything that is
# not positive falls back to BASE_CRAWL_TIME, and the number of seconds added
# is capped at MAX_CRAWL_TIME.
#
# @return [Time] the current time plus the bounded number of seconds
def bound_time
  seconds = @options[:time_box].to_s.to_i
  seconds = BASE_CRAWL_TIME unless seconds.positive?

  Time.now + [seconds, MAX_CRAWL_TIME].min
end

#bound_urls ⇒ Object (also known as: max_urls)



70
71
72
73
74
75
76
# File 'lib/arachnid2/exoskeleton.rb', line 70

# Resolves how many URLs may be handled, based on the +:max_urls+ option.
#
# The option is coerced to an Integer through its string form. Values that
# are not positive are replaced with BASE_URLS, and anything above MAX_URLS
# is reduced to MAX_URLS.
#
# @return [Integer] the bounded URL count
def bound_urls
  requested = @options[:max_urls].to_s.to_i
  requested = BASE_URLS unless requested.positive?

  requested > MAX_URLS ? MAX_URLS : requested
end

#browser_type ⇒ Object



3
4
5
6
7
8
9
10
# File 'lib/arachnid2/exoskeleton.rb', line 3

# Returns the browser identifier as a Symbol, memoized in @browser_type.
#
# A truthy +:browser_type+ option is converted to a Symbol via its string
# form; when the option is absent (or falsy) the result is :firefox.
#
# @return [Symbol]
def browser_type
  @browser_type ||= (@options[:browser_type] || :firefox).to_s.to_sym
end

#crawl_options ⇒ Object



88
89
90
# File 'lib/arachnid2/exoskeleton.rb', line 88

# Builds, once, the hash of crawl limits and returns the same hash on every
# later call.
#
# Because the hash is memoized, +max_urls+ and +time_limit+ are evaluated
# only the first time this method runs.
#
# @return [Hash] with the keys +:max_urls+ and +:time_limit+
def crawl_options
  return @crawl_options if @crawl_options

  @crawl_options = { max_urls: max_urls, time_limit: time_limit }
end

#extension_ignored?(url) ⇒ Boolean

Returns:

  • (Boolean)


104
105
106
107
108
# File 'lib/arachnid2/exoskeleton.rb', line 104

# Tells whether +url+ ends with one of the configured non-HTML extensions.
#
# The comparison ignores case on both sides. All values of the
# +non_html_extensions+ hash are flattened into one candidate list.
#
# @param url [String] the URL to inspect
# @return [Boolean] false for an empty URL or when no extension matches
def extension_ignored?(url)
  return false if url.empty?

  lowered = url.downcase
  candidates = non_html_extensions.values.flatten

  candidates.any? { |extension| lowered.end_with?(extension.downcase) }
end

#extract_hrefs(body) ⇒ Object



18
19
20
21
# File 'lib/arachnid2/exoskeleton.rb', line 18

# Collects the +href+ values of every +a+ element found in +body+.
#
# Anchors without an +href+ (or with an empty one) are dropped, duplicates
# are removed, and the remaining values are returned in sorted order.
#
# @param body [String] markup handed to Nokogiri::HTML.parse
# @return [Array<String>] unique, sorted, non-empty href strings
def extract_hrefs(body)
  anchors = Nokogiri::HTML.parse(body).css('a')
  hrefs   = anchors.map { |anchor| anchor.attribute('href').to_s }

  hrefs.reject(&:empty?).uniq.sort
end

#in_docker? ⇒ Boolean

Returns:

  • (Boolean)


121
122
123
# File 'lib/arachnid2/exoskeleton.rb', line 121

# Reports whether MEMORY_USE_FILE exists as a regular file.
#
# NOTE(review): the method name suggests the file's presence is treated as a
# sign of running inside a Docker container — confirm against the place
# where MEMORY_USE_FILE is defined.
#
# @return [Boolean]
def in_docker?
  File.file?(MEMORY_USE_FILE)
end

#internal_link?(absolute_url) ⇒ Boolean

Returns:

  • (Boolean)


100
101
102
# File 'lib/arachnid2/exoskeleton.rb', line 100

# Checks whether the domain reported by Adomain for +absolute_url+ contains
# @domain as a substring.
#
# A nil result from Adomain is treated as an empty string.
# NOTE(review): this is a substring test, so a domain that merely contains
# @domain also counts as internal — confirm this is intended.
#
# @param absolute_url [String] the URL to classify
# @return [Boolean]
def internal_link?(absolute_url)
  link_domain = Adomain[absolute_url].to_s
  link_domain.include?(@domain)
end

#make_absolute(href, root) ⇒ Object



96
97
98
# File 'lib/arachnid2/exoskeleton.rb', line 96

# Resolves +href+ against +root+ and returns the joined URI as a String.
#
# Both arguments are parsed with Addressable::URI.parse (root first), then
# the parsed href is joined onto the parsed root.
#
# @param href [String] the link to resolve
# @param root [String] the URL the link is resolved against
# @return [String] the string form of the joined URI
def make_absolute(href, root)
  base      = Addressable::URI.parse(root)
  reference = Addressable::URI.parse(href)

  base.join(reference).to_s
end

#maximum_load_rate ⇒ Object



125
126
127
128
129
130
131
# File 'lib/arachnid2/exoskeleton.rb', line 125

# Returns the load-rate threshold derived from the +:memory_limit+ option,
# memoized in @maximum_load_rate.
#
# The option is coerced to a Float through its string form. Only values
# strictly between 0.0 and 100.0 are accepted; anything else yields
# DEFAULT_MAXIMUM_LOAD_RATE.
#
# @return [Float] the accepted option value, or DEFAULT_MAXIMUM_LOAD_RATE
def maximum_load_rate
  @maximum_load_rate ||= begin
    requested = @options[:memory_limit].to_s.to_f
    requested.positive? && requested < 100.0 ? requested : DEFAULT_MAXIMUM_LOAD_RATE
  end
end

#memory_danger? ⇒ Boolean

Returns:

  • (Boolean)


110
111
112
113
114
115
116
117
118
119
# File 'lib/arachnid2/exoskeleton.rb', line 110

# Tells whether current memory use has reached the configured threshold.
#
# Reads the number in MEMORY_USE_FILE on every call and the number in
# MEMORY_LIMIT_FILE once (memoized in @limit), then compares use / limit,
# as a percentage, against +maximum_load_rate+.
#
# Files are read with File.binread, which closes the handle before it
# returns, so repeated calls do not accumulate open file descriptors.
#
# @return [Boolean] true when the usage percentage is greater than or equal
#   to +maximum_load_rate+; false when +in_docker?+ is false or when either
#   number is not positive
def memory_danger?
  return false unless in_docker?

  use      = File.binread(MEMORY_USE_FILE).to_f
  # 0.0 is truthy in Ruby, so the limit is read exactly once even if it
  # parses to zero.
  @limit ||= File.binread(MEMORY_LIMIT_FILE).to_f

  return false unless use.positive? && @limit.positive?

  (use / @limit) * 100.0 >= maximum_load_rate
end

#non_html_extensions ⇒ Object



55
56
57
58
59
60
# File 'lib/arachnid2/exoskeleton.rb', line 55

# Returns the extension table used to recognise non-HTML links, memoized in
# @non_html_extensions.
#
# The +:non_html_extensions+ option wins when it is truthy; otherwise
# DEFAULT_NON_HTML_EXTENSIONS is used.
#
# @return [Object] the option value or DEFAULT_NON_HTML_EXTENSIONS
def non_html_extensions
  @non_html_extensions ||= @options[:non_html_extensions] || DEFAULT_NON_HTML_EXTENSIONS
end

#preflight(opts) ⇒ Object



45
46
47
48
49
# File 'lib/arachnid2/exoskeleton.rb', line 45

# Stores the options and resets the shared bookkeeping state.
#
# Sets @options to +opts+, replaces @global_visited with a fresh
# BloomFilter::Native built from fixed parameters, and resets @global_queue
# to a one-element array holding @url.
#
# @param opts [Hash] options read later through @options
# @return [Array] the new @global_queue
def preflight(opts)
  @options        = opts
  @global_visited = BloomFilter::Native.new(size: 1_000_000, hashes: 5, seed: 1, bucket: 8, raise: true)
  @global_queue   = [@url]
end

#process(url, html) ⇒ Object



12
13
14
15
16
# File 'lib/arachnid2/exoskeleton.rb', line 12

# Extracts the links of a page, but only when the page belongs to @domain.
#
# The string form of +url+ is looked up through Adomain; when the lookup
# yields nil, or a value that does not contain @domain, nothing is extracted.
#
# @param url [#to_s] address of the page that produced +html+
# @param html [String] the page body passed on to +extract_hrefs+
# @return [Array<String>, false] the result of +extract_hrefs+, or false
#   when the domain check fails
def process(url, html)
  if Adomain[url.to_s]&.include?(@domain)
    extract_hrefs(html)
  else
    false
  end
end

#proxy ⇒ Object



51
52
53
# File 'lib/arachnid2/exoskeleton.rb', line 51

# Returns the +:proxy+ entry of @options, unmodified.
#
# @return [Object, nil] whatever was supplied under +:proxy+; nil when the
#   key is absent (assuming @options is a plain Hash — TODO confirm)
def proxy
  @options[:proxy]
end

#skip_link?(absolute_link) ⇒ Boolean

Returns:

  • (Boolean)


38
39
40
41
42
43
# File 'lib/arachnid2/exoskeleton.rb', line 38

# Decides whether +absolute_link+ should be left out of the queue.
#
# A link is skipped when any of the following holds, checked in this order
# and stopping at the first hit:
# * it is not an internal link,
# * @global_visited already includes it,
# * its extension is ignored,
# * @global_queue already includes it.
#
# @param absolute_link [String] a fully resolved URL
# @return [Boolean]
def skip_link?(absolute_link)
  return true unless internal_link?(absolute_link)

  already_visited = @global_visited.include?(absolute_link)
  already_visited || extension_ignored?(absolute_link) || @global_queue.include?(absolute_link)
end

#timeout ⇒ Object



78
79
80
81
82
83
84
85
86
# File 'lib/arachnid2/exoskeleton.rb', line 78

# Returns the timeout to use, memoized in @timeout.
#
# The +:timeout+ option is honoured only when it is an Integer within
# MINIMUM_TIMEOUT..MAXIMUM_TIMEOUT (both ends included); every other value
# results in DEFAULT_TIMEOUT.
#
# @return [Integer] the accepted option value, or DEFAULT_TIMEOUT
def timeout
  @timeout ||= begin
    requested = @options[:timeout]
    usable = requested.is_a?(Integer) && requested.between?(MINIMUM_TIMEOUT, MAXIMUM_TIMEOUT)

    usable ? requested : DEFAULT_TIMEOUT
  end
end

#vacuum(links, url) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/arachnid2/exoskeleton.rb', line 23

# Resolves each link against +url+ and appends the acceptable ones to
# @global_queue.
#
# Links matching the exclusion pattern (leading "(", javascript:, mailto:,
# "#", about:, or a blank line) are ignored, as are links for which
# +skip_link?+ is true. A link that cannot be parsed is silently dropped.
#
# @param links [Array<String>] raw href values
# @param url [String] the page the links were found on
# @return [Array<String>] +links+ itself
def vacuum(links, url)
  links.each do |href|
    next if href.match?(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)

    begin
      candidate = make_absolute(href, url)
      @global_queue << candidate unless skip_link?(candidate)
    rescue Addressable::URI::InvalidURIError
      # Deliberately ignored: one malformed href must not stop the sweep.
    end
  end
end