Module: Arachnid2::Exoskeleton
Instance Method Summary
- #bound_time ⇒ Object (also: #time_limit)
- #bound_urls ⇒ Object (also: #max_urls)
- #browser_type ⇒ Object
- #crawl_options ⇒ Object
- #extension_ignored?(url) ⇒ Boolean
- #extract_hrefs(body) ⇒ Object
- #in_docker? ⇒ Boolean
- #internal_link?(absolute_url) ⇒ Boolean
- #make_absolute(href, root) ⇒ Object
- #maximum_load_rate ⇒ Object
- #memory_danger? ⇒ Boolean
- #non_html_extensions ⇒ Object
- #preflight(opts) ⇒ Object
- #process(url, html) ⇒ Object
- #proxy ⇒ Object
- #skip_link?(absolute_link) ⇒ Boolean
- #timeout ⇒ Object
- #vacuum(links, url) ⇒ Object
Instance Method Details
#bound_time ⇒ Object Also known as: time_limit
62 63 64 65 66 67 68 |
# File 'lib/arachnid2/exoskeleton.rb', line 62
#
# Computes the deadline for the current crawl.
#
# Reads @options[:time_box] and coerces it to an Integer number of seconds.
# A missing, non-numeric, zero or negative value is replaced by
# BASE_CRAWL_TIME; anything above MAX_CRAWL_TIME is capped to that maximum.
#
# @return [Time] the current time plus the bounded number of seconds
def bound_time
  seconds = @options[:time_box].to_s.to_i
  seconds = BASE_CRAWL_TIME unless seconds.positive?
  seconds = seconds > MAX_CRAWL_TIME ? MAX_CRAWL_TIME : seconds

  Time.now + seconds
end
#bound_urls ⇒ Object Also known as: max_urls
70 71 72 73 74 75 76 |
# File 'lib/arachnid2/exoskeleton.rb', line 70
#
# Computes the maximum number of URLs the crawl may handle.
#
# Reads @options[:max_urls] and coerces it to an Integer. A missing,
# non-numeric, zero or negative value is replaced by BASE_URLS; anything
# above MAX_URLS is capped to that maximum.
#
# @return [Integer] the bounded URL count
def bound_urls
  requested = @options[:max_urls].to_s.to_i
  requested = BASE_URLS unless requested.positive?

  requested > MAX_URLS ? MAX_URLS : requested
end
#browser_type ⇒ Object
3 4 5 6 7 8 9 10 |
# File 'lib/arachnid2/exoskeleton.rb', line 3
#
# Returns the browser to use, memoized in @browser_type.
#
# On the first call the value of @options[:browser_type] is converted to a
# Symbol; when that option is absent (nil or false) the result is :firefox.
# Later calls return the memoized value without consulting @options again.
#
# @return [Symbol] the configured browser type, or :firefox by default
def browser_type
  @browser_type ||= begin
    requested = @options[:browser_type]
    requested ? requested.to_s.to_sym : :firefox
  end
end
#crawl_options ⇒ Object
88 89 90 |
# File 'lib/arachnid2/exoskeleton.rb', line 88
#
# Returns the limits that apply to the current crawl, memoized in
# @crawl_options so that max_urls and time_limit are each evaluated only
# once.
#
# @return [Hash] a hash with the keys :max_urls and :time_limit
def crawl_options
  @crawl_options ||= { max_urls: max_urls, time_limit: time_limit }
end
#extension_ignored?(url) ⇒ Boolean
104 105 106 107 108 |
# File 'lib/arachnid2/exoskeleton.rb', line 104
#
# Tells whether a URL ends with one of the configured non-HTML extensions.
#
# The comparison is case-insensitive. non_html_extensions is treated as a
# Hash whose values are (possibly nested) arrays of extension strings; the
# values are flattened into a single list before matching.
#
# @param url [String, nil] the URL to inspect
# @return [Boolean] true when the URL ends with an ignored extension;
#   false otherwise, including when url is nil or empty
def extension_ignored?(url)
  return false if url.nil? || url.empty?

  # Lower-case the URL once instead of once per candidate extension.
  candidate = url.downcase
  non_html_extensions.values.flatten.any? { |ext| candidate.end_with?(ext.downcase) }
end
#extract_hrefs(body) ⇒ Object
18 19 20 21 |
# File 'lib/arachnid2/exoskeleton.rb', line 18
#
# Collects the href values of every <a> element in an HTML document.
#
# The body is parsed with Nokogiri, each anchor's href attribute is
# converted to a String, and the result is de-duplicated, sorted, and
# stripped of empty strings (anchors without an href yield "").
#
# @param body [String] the HTML to parse
# @return [Array<String>] unique, sorted, non-empty href values
def extract_hrefs(body)
  anchors = Nokogiri::HTML.parse(body).css('a')
  hrefs = anchors.map { |anchor| anchor.attribute('href').to_s }

  hrefs.uniq.sort.reject(&:empty?)
end
#in_docker? ⇒ Boolean
121 122 123 |
# File 'lib/arachnid2/exoskeleton.rb', line 121
#
# Reports whether the memory-usage file exists as a regular file.
#
# NOTE(review): despite the name, the only thing checked is the presence of
# MEMORY_USE_FILE; presumably that file is only available inside a
# container -- confirm against the constant's definition.
#
# @return [Boolean] true when MEMORY_USE_FILE is an existing regular file
def in_docker?
  usage_path = MEMORY_USE_FILE
  File.file?(usage_path)
end
#internal_link?(absolute_url) ⇒ Boolean
100 101 102 |
# File 'lib/arachnid2/exoskeleton.rb', line 100
#
# Tells whether a URL belongs to the domain being crawled.
#
# The domain that Adomain extracts from the URL is converted to a String
# (so a nil result becomes "") and tested for containing @domain.
#
# NOTE(review): this is a substring test, so a domain that merely contains
# @domain also counts as internal -- confirm that this is intended.
#
# @param absolute_url [String] the absolute URL to classify
# @return [Boolean] true when the URL's domain includes @domain
def internal_link?(absolute_url)
  link_domain = Adomain[absolute_url].to_s
  link_domain.include?(@domain)
end
#make_absolute(href, root) ⇒ Object
96 97 98 |
# File 'lib/arachnid2/exoskeleton.rb', line 96
#
# Resolves a (possibly relative) href against a root URL.
#
# Both arguments are parsed with Addressable::URI, the href is joined onto
# the root, and the result is returned as a String.
#
# NOTE(review): parsing presumably raises Addressable::URI::InvalidURIError
# for malformed input (vacuum rescues that error around this call) --
# confirm against the Addressable documentation.
#
# @param href [String] the link as found in the page
# @param root [String] the URL the link was found on
# @return [String] the absolute URL
def make_absolute(href, root)
  base = Addressable::URI.parse(root)
  target = Addressable::URI.parse(href)

  base.join(target).to_s
end
#maximum_load_rate ⇒ Object
125 126 127 128 129 130 131 |
# File 'lib/arachnid2/exoskeleton.rb', line 125
#
# Returns the memory-usage percentage above which memory_danger? reports
# danger, memoized in @maximum_load_rate.
#
# @options[:memory_limit] is coerced to a Float and accepted only when it
# lies strictly between 0.0 and 100.0; any other value (missing,
# non-numeric, out of range) yields DEFAULT_MAXIMUM_LOAD_RATE.
#
# @return [Float] the accepted limit, or DEFAULT_MAXIMUM_LOAD_RATE
def maximum_load_rate
  @maximum_load_rate ||= begin
    requested = @options[:memory_limit].to_s.to_f
    within_bounds = requested > 0.0 && requested < 100.0
    within_bounds ? requested : DEFAULT_MAXIMUM_LOAD_RATE
  end
end
#memory_danger? ⇒ Boolean
110 111 112 113 114 115 116 117 118 119 |
# File 'lib/arachnid2/exoskeleton.rb', line 110
#
# Tells whether current memory usage has reached the configured limit.
#
# Usage is read from MEMORY_USE_FILE on every call; the limit is read from
# MEMORY_LIMIT_FILE once and memoized in @limit. Both files are parsed as
# Floats. The answer is false when in_docker? is false or when either
# number is not positive (for example an empty or non-numeric file).
#
# Files are read with File.binread, which closes the handle before
# returning, so repeated calls do not accumulate open file descriptors.
#
# @return [Boolean] true when usage / limit, as a percentage, is greater
#   than or equal to maximum_load_rate
def memory_danger?
  return false unless in_docker?

  use = File.binread(MEMORY_USE_FILE).to_f
  @limit ||= File.binread(MEMORY_LIMIT_FILE).to_f
  return false unless use > 0.0 && @limit > 0.0

  (use / @limit) * 100.0 >= maximum_load_rate
end
#non_html_extensions ⇒ Object
55 56 57 58 59 60 |
# File 'lib/arachnid2/exoskeleton.rb', line 55
#
# Returns the extensions that mark a URL as non-HTML, memoized in
# @non_html_extensions.
#
# The value of @options[:non_html_extensions] is used when present (not nil
# or false); otherwise DEFAULT_NON_HTML_EXTENSIONS is used.
#
# @return [Object] the configured extensions, or the default set
def non_html_extensions
  @non_html_extensions ||= @options[:non_html_extensions] || DEFAULT_NON_HTML_EXTENSIONS
end
#preflight(opts) ⇒ Object
45 46 47 48 49 |
# File 'lib/arachnid2/exoskeleton.rb', line 45
#
# Prepares the state needed before a crawl starts.
#
# Stores the options, creates a fresh BloomFilter::Native in
# @global_visited, and seeds @global_queue with @url. @url is only read
# here, so it must already have been set.
#
# @param opts [Hash] the crawl options, stored as-is in @options
# @return [Array] the newly seeded @global_queue
def preflight(opts)
  @options = opts
  @global_visited = BloomFilter::Native.new(
    size: 1_000_000,
    hashes: 5,
    seed: 1,
    bucket: 8,
    raise: true
  )
  @global_queue = [@url]
end
#process(url, html) ⇒ Object
12 13 14 15 16 |
# File 'lib/arachnid2/exoskeleton.rb', line 12
#
# Extracts the links of a fetched page, provided the page is on the crawled
# domain.
#
# The domain that Adomain extracts from the URL must include @domain; when
# Adomain returns nil or the domain does not match, the page is rejected.
#
# @param url [#to_s] the URL the HTML was fetched from
# @param html [String] the page body
# @return [Array<String>, false] the result of extract_hrefs, or false when
#   the page is not on the crawled domain
def process(url, html)
  page_domain = Adomain[url.to_s]
  return false unless page_domain&.include?(@domain)

  extract_hrefs(html)
end
#proxy ⇒ Object
51 52 53 |
# File 'lib/arachnid2/exoskeleton.rb', line 51
#
# Returns the proxy configuration supplied in the crawl options.
#
# @return [Object, nil] the value stored under :proxy in @options, or nil
#   when no such key is present
def proxy
  proxy_settings = @options[:proxy]
  proxy_settings
end
#skip_link?(absolute_link) ⇒ Boolean
38 39 40 41 42 43 |
# File 'lib/arachnid2/exoskeleton.rb', line 38
#
# Decides whether a link should be left out of the crawl queue.
#
# A link is skipped when any of the following holds, checked in this order
# and stopping at the first match:
#   1. it is not an internal link,
#   2. @global_visited already includes it,
#   3. its extension is ignored,
#   4. @global_queue already includes it.
#
# @param absolute_link [String] the absolute URL to evaluate
# @return [Boolean] true when the link must be skipped
def skip_link?(absolute_link)
  return true unless internal_link?(absolute_link)
  return true if @global_visited.include?(absolute_link)
  return true if extension_ignored?(absolute_link)

  @global_queue.include?(absolute_link)
end
#timeout ⇒ Object
78 79 80 81 82 83 84 85 86 |
# File 'lib/arachnid2/exoskeleton.rb', line 78
#
# Returns the request timeout, memoized in @timeout.
#
# @options[:timeout] is accepted only when it is an Integer between
# MINIMUM_TIMEOUT and MAXIMUM_TIMEOUT (both inclusive); any other value
# yields DEFAULT_TIMEOUT.
#
# @return [Integer] the accepted timeout, or DEFAULT_TIMEOUT
def timeout
  @timeout ||= begin
    requested = @options[:timeout]
    acceptable = requested.is_a?(Integer) &&
                 requested.between?(MINIMUM_TIMEOUT, MAXIMUM_TIMEOUT)
    acceptable ? requested : DEFAULT_TIMEOUT
  end
end
#vacuum(links, url) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/arachnid2/exoskeleton.rb', line 23
#
# Appends every crawlable link found on a page to @global_queue.
#
# Links that start with "(", "javascript:", "mailto:", "#" or "about:", and
# links that are blank, are ignored outright. Each remaining link is made
# absolute against the page URL and queued unless skip_link? rejects it.
#
# @param links [Array<String>] the hrefs found on the page
# @param url [String] the URL of the page the links came from
# @return [Array<String>] the links argument, unchanged
def vacuum(links, url)
  unfollowable = /^\(|^javascript:|^mailto:|^#|^\s*$|^about:/

  links.each do |href|
    next if href.match(unfollowable)

    begin
      candidate = make_absolute(href, url)
      @global_queue << candidate unless skip_link?(candidate)
    rescue Addressable::URI::InvalidURIError
      # Best effort: a link that cannot be parsed is dropped and the
      # remaining links are still processed.
    end
  end
end