user215379 user215379 - 6 months ago 38
Python Question

Python Webdriver Multithread

I'm trying to spawn multiple WebDriver instances, one per thread, based on the threaded queue example from: http://www.ibm.com/developerworks/aix/library/au-threadingpython/

import time
import Queue
import urllib2
import threading
from selenium import webdriver
from BeautifulSoup import BeautifulSoup
# URLs that main() feeds into the work queue.
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
"http://ibm.com", "http://apple.com"]
# NOTE(review): both names are bound to the Queue.Queue class itself -- there
# are no call parentheses, so no queue instance is created on these lines.
queue = Queue.Queue
out_queue = Queue.Queue

# Worker thread: takes a host from `queue`, loads it through a webdriver and
# puts the page source on `out_queue`.
class Login_Driver(threading.Thread):
# queue: source of hosts; out_queue: destination for page sources;
# driver: the webdriver object handed in by main().
def __init__(self, queue, out_queue, driver):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
self.driver = driver
# Prints the driver's `title` attribute at construction time.
print driver.title
def run(self):
# Loops forever; main() marks these threads as daemons.
while True:
# Take the next host from the queue.
host = self.queue.get()
# Load the host and read the page source.
# NOTE(review): the next two lines use the bare name `driver` rather than
# `self.driver`, and call page_source() -- presumably `self.driver` and the
# `page_source` attribute were intended; confirm.
driver.get(host)
chunk = driver.page_source()
# Hand the page source to the parsing stage.
self.out_queue.put(chunk)
# Mark this host as handled (pairs with queue.join() in main()).
self.queue.task_done()
# Consumer thread: takes page sources from `out_queue`, parses each one with
# BeautifulSoup and prints the <title> elements it finds.
class Poster(threading.Thread):
# NOTE(review): the parameters are (driver, out_queue) but main() calls
# Poster(out_queue) with a single argument -- confirm the intended signature.
# NOTE(review): threading.Thread.__init__ is not called here, unlike in
# Login_Driver.__init__.
def __init__(self, driver, out_queue):
self.out_queue = out_queue
self.driver = driver
# Prints the driver's `name` attribute at construction time.
print driver.name
def run(self):
# Loops forever; main() marks these threads as daemons.
while True:
# Take the next page source from the output queue.
chunk = self.out_queue.get()
# Parse the page source and print its <title> elements.
soup = BeautifulSoup(chunk)
print soup.findAll(['title'])
# Mark this chunk as handled (pairs with out_queue.join() in main()).
self.out_queue.task_done()
# Wall-clock start time, used for the elapsed-time report at the end of the script.
start = time.time()
# Starts the browser threads, queues the hosts, starts the parser threads and
# then waits for both queues to be fully processed.
def main():
# Create Firefox webdrivers and daemon Login_Driver threads (five iterations).
# NOTE(review): indentation was lost in this listing; presumably the lines up
# to t.start() form the loop body -- confirm against the original.
for i in range(5):
driver = webdriver.Firefox()
t = Login_Driver(queue, out_queue, driver)
t.setDaemon(True)
t.start()
time.sleep(20)
# Queue every host for the worker threads.
for host in hosts:
queue.put(host)
# Start the daemon Poster threads (five iterations).
# NOTE(review): Poster.__init__ is declared as (driver, out_queue), so this
# one-argument call does not match it -- confirm.
for i in range(5):
dt = Poster(out_queue)
dt.setDaemon(True)
dt.start()
# Block until every item put on each queue has been marked done.
queue.join()
out_queue.join()
# Run the pipeline, then report the seconds elapsed since `start`.
main()
print "Elapsed Time: %s" % (time.time() - start)


It fails with: TypeError: unbound method get() must be called with Queue instance as first argument (got nothing instead)

I'm new to threads, classes, and processes. Can you please tell me which is more appropriate here, threads or processes? An example would be great.
Thank you guys.

UPDATE

Working code:

import time
import Queue
import urllib2
import threading
from selenium import webdriver
from BeautifulSoup import BeautifulSoup

# Sites to load; each URL becomes one item on the work queue.
hosts = [
    "http://yahoo.com",
    "http://google.com",
    "http://amazon.com",
    "http://ibm.com",
    "http://apple.com",
]

# Work queue (hosts waiting to be loaded) and result queue (page sources
# waiting to be parsed). Both are real Queue instances shared by all threads.
queue = Queue.Queue()
out_queue = Queue.Queue()

class Login_Driver(threading.Thread):
    """Worker thread that loads queued hosts in its own webdriver.

    Each host taken from ``queue`` is opened with ``driver``; the resulting
    page source is put on ``out_queue`` for the parsing stage.

    Args:
        queue: queue of host URLs to load.
        out_queue: queue that receives one page source per loaded host.
        driver: webdriver-like object providing ``get()``, ``page_source``
            and ``title``.
    """

    def __init__(self, queue, out_queue, driver):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue
        self.driver = driver
        print("In init first class..")

    def run(self):
        """Process hosts forever; meant to be run as a daemon thread."""
        while True:
            # Blocks until a host is available.
            host = self.queue.get()
            try:
                self.driver.get(host)
                self.out_queue.put(self.driver.page_source)
                print(self.driver.title)
            except Exception as exc:
                # A failed page load must not kill the worker thread; report
                # it and move on to the next host.
                print("Failed to load %s: %s" % (host, exc))
            finally:
                # Always acknowledge the item: without this a single failure
                # would leave queue.join() in the caller blocked forever.
                self.queue.task_done()
class Poster(threading.Thread):
    """Consumer thread that parses page sources and prints their titles.

    Each chunk taken from ``out_queue`` is parsed with BeautifulSoup and its
    ``<title>`` elements are printed, followed by the driver's name.

    Args:
        out_queue: queue of page sources produced by the loading stage.
        driver: webdriver-like object; only its ``name`` attribute is read.
    """

    def __init__(self, out_queue, driver):
        threading.Thread.__init__(self)
        self.out_queue = out_queue
        self.driver = driver
        print("In init a second class..")

    def run(self):
        """Process page sources forever; meant to be run as a daemon thread."""
        while True:
            # Blocks until a page source is available.
            chunk = self.out_queue.get()
            try:
                soup = BeautifulSoup(chunk)
                print(soup.findAll(['title']))
                print(self.driver.name)
            except Exception as exc:
                # One unparsable page must not kill the thread; report it and
                # continue with the next chunk.
                print("Failed to parse page: %s" % (exc,))
            finally:
                # Always acknowledge the item: without this a single failure
                # would leave out_queue.join() in the caller blocked forever.
                self.out_queue.task_done()
# Wall-clock start time, used for the elapsed-time report at the end of the script.
start = time.time()
def main(worker_count=5):
    """Load every URL in ``hosts`` and print each page's <title> elements.

    Starts ``worker_count`` Login_Driver threads (each with its own Firefox
    instance), queues all hosts, starts one Poster thread per driver and
    waits until both queues have been fully processed. All browsers are
    closed before returning, even if an error occurs.

    Args:
        worker_count: number of browser threads and parser threads to start.

    Raises:
        ValueError: if ``worker_count`` is less than 1 (nothing would ever
            consume the queues and the joins would block forever).
    """
    if worker_count < 1:
        raise ValueError("worker_count must be at least 1")

    drivers = []
    try:
        # Spawn the pool of browser threads; each owns a separate webdriver.
        for i in range(worker_count):
            driver = webdriver.Firefox()
            drivers.append(driver)
            t = Login_Driver(queue, out_queue, driver)
            t.daemon = True
            t.start()
            print("Started webdriver: --- " + str(i) + " --- from main")
        print("All started")

        # No sleeps are needed between stages: the worker threads block on
        # Queue.get() until work arrives, and join() below blocks until every
        # item has been acknowledged.
        for host in hosts:
            queue.put(host)
            print("Opening website: " + host)
        print("All sites passed for opening..")

        # Pair each parser thread with one of the drivers created above.
        for i, driver in enumerate(drivers):
            dt = Poster(out_queue, driver)
            dt.daemon = True
            dt.start()
            print("Starting second class/title and name beautifull soup and webdriver: --- " + str(i) + " --- from main")
        print("Started secound class..")

        # Wait until every host was loaded, then until every page was parsed.
        queue.join()
        out_queue.join()
        print("out_queue.join()")
    finally:
        # Close every browser that was opened so no Firefox process is left
        # behind; one failing quit() must not prevent the others.
        for driver in drivers:
            try:
                driver.quit()
            except Exception as exc:
                print("Failed to quit webdriver: %s" % (exc,))
# Only run the pipeline when executed as a script, so importing this module
# does not launch browsers as a side effect.
if __name__ == "__main__":
    main()
    print("Elapsed Time: %s" % (time.time() - start))

Answer

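You need to use Queue.Queue() instead of Queue.Queue. Without the parentheses, `queue` and `out_queue` refer to the Queue class itself rather than to an instance, so calling `get()` or `put()` on them raises the "unbound method" TypeError. Calling `Queue.Queue()` creates the actual queue objects that the threads share.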

Comments