Renu sharma Renu sharma -4 years ago 48
Python Question

Web scraping to get data from website

I am learning python & trying to scrape a website, having 10 listing of properties on each page. I want to extract information from each listing on each page.
My code for first 5 pages is as follows :-

import requests
from bs4 import BeautifulSoup

urls = []
for i in range(1,5):
pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
urls.append(pages)
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'details-panel'})
hrefs = [link['href'] for link in links]
Data = []
for urls in hrefs:
pages = requests.get(urls)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
Address = [Address.text.strip() for Address in Address_1]
Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
Area = [Area.text.strip() for Area in Area_1]
Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)


The above code is not working for me. Please let me know the correct coding to achieve the purpose.

Answer Source

There is one problem in your code is that you declared the variable "urls" twice. You need to update the code like below:

import requests 
from bs4 import BeautifulSoup

urls = []
for i in range(1,6):
    pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
    urls.append(pages)

Data = []
for info in urls:
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs ={'class' :'details-panel'})
    hrefs = [link['href'] for link in links]

    for href in hrefs:
        pages = requests.get(href)
        soup_2 =BeautifulSoup(pages.content, 'html.parser')
        Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
        Address = [Address.text.strip() for Address in Address_1]
        Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
        Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
        Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
        Area = [Area.text.strip() for Area in Area_1]
        Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
        Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
        Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
        Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
        Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)

print Data
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download