Marketing Rascal Marketing Rascal - 2 months ago 13
Ruby Question

Anemone - NoMethodError: undefined method `xpath' for nil:NilClass

I'm just starting to learn more about writing a web crawler in Ruby which is designed to crawl my blog and find broken external links using the Anemone gem and the rake task below...

task :testing_this => :environment do
require 'anemone'
rooter = 'myblog.com'
banned = ['tel:','@', '#', 'facebook.com', 'twitter.com', 'pinterest.com', 'linkedin.com', 'youtube.com','reddit.com', 'wikipedia.org']
extensions = %w( .jpg .jpeg .png .doc .pdf .js .css .xml .csv. .exe .zip .gzip )
start = Time.now
Anemone.crawl("http://#{rooter}/", {:threads => 4, :discard_page_bodies => false, :obey_robots_txt => false, :user_agent => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}) do |anemone|
begin
anemone.on_every_page do |page|
puts page.url.path
links = page.doc.xpath("//a/@href")
if (links != nil)
links.each do |link|
this_link = link.to_s
unless extensions.any? { |exten| this_link && this_link.include?(exten) }
unless banned.any? { |word| this_link && this_link.include?(word) }
unless this_link.include? rooter
# puts this_link
obl = URI.parse(URI.encode(this_link.strip)).host
unless obl.blank?
if obl.include? 'www.'
obl = obl.gsub("www.", "")
end
Obl.find_or_create_by(url: obl)
end
end
end
end
end
end
end
rescue OpenURI::HTTPError => ex
puts ex
end
end
time_t = Time.now - start
puts "-------------"
puts "#{time_t} seconds"
puts "-------------"
end


It's working on my demo folder, however, I've been using https://arthurdejong.org/webcheck/demo/ to test it and I'm getting the following error:

NoMethodError: undefined method `xpath' for nil:NilClass


I've tried building an array of extensions as I did wonder if that was what was causing the issue but so far no luck.

Does anyone have any tips on how to debug this or work the problem through?

Answer

It seems the error comes from page.doc.xpath : page.doc is nil.

Try to inspect your page in the on_every_page method.

You also can add a if before .xpath to avoid error:

anemone.on_every_page do |page|
  puts page.url.path
  if page.doc.present?
    links = page.doc.xpath("//a/@href")