Dheepan Manoharan Dheepan Manoharan - 5 months ago 78
Python Question

pyqt4: Loop main Render class?

I have a PyQt4 class that downloads web pages, which I use for scraping purposes.

When I pass a single list of URLs to the Render class when instantiating it (a single call), it works fine. But when I try to loop over multiple lists of URLs and call `r = Render(url, cb=scrape)` for each one, the execution stops or hangs after the first iteration without any error being thrown.

I want to instantiate the class separately for each list because the URL lists belong to different categories, and I have to store the extracted contents separately.

I also learned that only one app can be instantiated. If that is the case, how do I exit the app without quitting it, so that a new URL list can be used by the same app?

I have been stuck on this issue for a while. Thanks in advance.

import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

class Render(QWebPage):
    """Download a list of URLs one after another through a QtWebKit page.

    Instantiating the class starts the crawl immediately and blocks inside
    the Qt event loop until every URL in ``urls`` has been loaded.

    NOTE(review): every instance constructs its own QApplication, so this
    class is only suitable for a single instantiation per process.
    """

    def __init__(self, urls, cb):
        """Crawl ``urls`` and call ``cb(url, html)`` for each loaded page.

        urls -- list of URL strings; it is consumed (popped) during the crawl.
        cb   -- callable invoked with the loaded URL and the frame's HTML.
        """
        # The QApplication is created before QWebPage.__init__ runs.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.urls = urls
        self.cb = cb
        # With nothing to download, crawl() would call quit() before the
        # event loop is running; Qt ignores that call and exec_() would then
        # block forever. Only enter the loop when a load is started.
        if self.urls:
            self.crawl()
            self.app.exec_()

    def crawl(self):
        """Start loading the next pending URL, or stop the event loop."""
        if self.urls:
            url = self.urls.pop(0)
            print('Downloading %s' % url)
            self.mainFrame().load(QUrl(url))
        else:
            # Nothing left to download: let exec_() in __init__ return.
            self.app.quit()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: report the page and continue the crawl.

        ``result`` is the value delivered by the signal; it is not inspected.
        """
        frame = self.mainFrame()
        url = str(frame.url().toString())
        html = frame.toHtml()
        self.cb(url, html)
        self.crawl()

def scrape(url, html):
    """Callback invoked with each downloaded page's URL and HTML.

    Placeholder that does nothing and returns ``None``.
    """
    pass  # have scraping code here

# Two independent URL lists (one per category); each list is handed to its
# own Render instance.
url1 = ['http://webscraping.com', 'http://webscraping.com/blog']
url2 = ['http://webscraping.com', 'http://webscraping.com/blog']
urls = [url1, url2]

for url in urls:
    # Despite its name, ``url`` is a whole list of URLs here.
    r = Render(url, cb=scrape)

Answer

The problem is you can only instantiate a single QApplication object. Here is an updated version that avoids this and then only runs Qt's execution loop when downloading a URL:

import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage

class Render(QWebPage):
    """Load web pages one at a time and pass their HTML to a callback.

    The Qt event loop is only run while a single URL is being downloaded,
    so ``crawl()`` can be called repeatedly on the same instance.
    """

    def __init__(self, cb):
        """Create the page; ``cb(url, html)`` is called after each load."""
        # Only one QApplication may exist per process, so reuse the current
        # one (if any) instead of constructing another when Render is
        # instantiated more than once.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.cb = cb

    def crawl(self, url):
        """Load ``url`` and block until ``_loadFinished`` stops the loop."""
        print('Downloading %s' % url)
        self.mainFrame().load(QUrl(url))
        # Run the event loop just for this download.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: report the page, then stop the loop.

        ``result`` is the value delivered by the signal; it is not inspected.
        """
        frame = self.mainFrame()
        url = str(frame.url().toString())
        html = frame.toHtml()
        try:
            self.cb(url, html)
        finally:
            # Always leave the event loop, even if the callback raised;
            # otherwise crawl() would never return.
            self.app.quit()


def scrape(url, html):
    """Callback invoked with each downloaded page's URL and HTML.

    Placeholder: it currently only prints the length of ``html``.
    """
    # add scraping code here
    print(len(html))


# Pages to download, in order.
urls = ['http://webscraping.com', 'http://webscraping.com/blog']

# A single Render instance is reused for every download.
r = Render(cb=scrape)
for url in urls:
    r.crawl(url)