# -*- coding: utf-8 -*-
import re
import csv
import time
try:
from PySide.QtGui import QApplication
from PySide.QtCore import QUrl, QEventLoop, QTimer
from PySide.QtWebKit import QWebView
except ImportError:
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QEventLoop, QTimer
from PyQt4.QtWebKit import QWebView
import lxml.html
class BrowserRender(QWebView):
def __init__(self, display=True):
self.app = QApplication([])
QWebView.__init__(self)
if display:
self.show() # show the browser
def open(self, url, timeout=60):
"""Wait for download to complete and return result"""
loop = QEventLoop()
timer = QTimer()
timer.setSingleShot(True)
timer.timeout.connect(loop.quit)
self.loadFinished.connect(loop.quit)
self.load(QUrl(url))
timer.start(timeout * 1000)
loop.exec_() # delay here until download finished
if timer.isActive():
# downloaded successfully
timer.stop()
return self.html()
else:
# timed out
print 'Request timed out:', url
def html(self):
"""Shortcut to return the current HTML"""
return self.page().mainFrame().toHtml()
def find(self, pattern):
"""Find all elements that match the pattern"""
return self.page().mainFrame().findAllElements(pattern)
def attr(self, pattern, name, value):
"""Set attribute for matching elements"""
for e in self.find(pattern):
e.setAttribute(name, value)
def text(self, pattern, value):
"""Set attribute for matching elements"""
for e in self.find(pattern):
e.setPlainText(value)
def click(self, pattern):
"""Click matching elements"""
for e in self.find(pattern):
e.evaluateJavaScript("this.click()")
def wait_load(self, pattern, timeout=60):
"""Wait for this pattern to be found in webpage and return matches"""
deadline = time.time() + timeout
while time.time() < deadline:
self.app.processEvents()
matches = self.find(pattern)
if matches:
return matches
print 'Wait load timed out'
def main():
br = BrowserRender()
br.open('http://example.webscraping.com/search')
br.attr('#search_term', 'value', '.')
br.text('#page_size option:checked', '1000')
br.click('#search')
elements = br.wait_load('#results a')
writer = csv.writer(open('countries.csv', 'w'))
for country in [e.toPlainText().strip() for e in elements]:
writer.writerow([country])
if __name__ == '__main__':
main()