Simple PhantomJS web scraping script
Here is a simple web scraping script I wrote in CoffeeScript for PhantomJS, the immensely useful headless browser. It loads a page, injects jQuery into it, and then scrapes the page using a user-supplied jQuery selector.
# PhantomJS modules: `system` provides the command-line arguments and `page`
# is the web page object used to load and script the target URL.
system = require('system')
page = require('webpage').create()

# Load Underscore into the PhantomJS (outer) script context.
phantom.injectJs("static/js/underscore-min.js")
# Relay console messages emitted by the page to this script's console,
# dropping any message that starts with "Unsafe".
page.onConsoleMessage = (message) ->
  return if /^Unsafe/.test message
  console.log message
# Logs the trimmed innerHTML of every element matched by a jQuery selector.
# This function is handed to page.evaluate, so it runs inside the loaded page
# and relies on `$` being defined there.
#
# elselector - jQuery selector string identifying the elements to scrape.
#
# Emits one console.log line per element whose trimmed innerHTML is non-empty.
# Returns nothing.
scrapeEl = (elselector) ->
  rows = $ elselector
  for el in rows
    # Skip elements with no (or empty) markup.
    continue unless el.innerHTML
    str = el.innerHTML.trim()
    console.log str if str.length > 0
  # Explicit bare return: without it CoffeeScript treats the trailing loop as
  # the function's value and builds and returns an array of loop results.
  return
# Entry point. Expects the page URL in system.args[1] and the jQuery selector
# in system.args[2]; system.args[0] is the script name.
#
# Exits with status 1 when arguments are missing or the page fails to load,
# and with the default status after a successful scrape.
if system.args.length < 3
  console.log "Usage: phantomjs #{system.args[0]} <url> <jquery-selector>"
  phantom.exit 1
else
  # The else branch matters: phantom.exit does not necessarily stop the script
  # immediately, so the page must not be opened after requesting an exit.
  page.open system.args[1], (status) ->
    if status isnt 'success'
      phantom.exit 1
    else
      # Inject helper libraries into the page; jQuery provides the `$` that
      # scrapeEl uses when it runs in the page context.
      page.injectJs "static/js/underscore-min.js"
      page.injectJs "static/js/utils.js"
      page.injectJs "static/js/jquery-1.8.2.min.js"
      page.evaluate scrapeEl, system.args[2]
      phantom.exit()
Run it with the page URL as the first argument and the jQuery selector as the second:
phantomjs scrape_element.coffee "http://www.moviefone.com/coming-soon" ".movieTitle span"