Burak Burak - 2 months ago 18
Python Question

Web scraping with BeautifulSoup: getting an error

I'm pretty new to Python and mainly need it for getting information from websites.

def spider(max_pages):
    """Crawl listing pages 1..max_pages and process every product link found.

    Each listing page is downloaded, every ``<a class="c5">`` anchor on it is
    treated as a product link, and its ``href`` is passed to ``single_item``.

    Args:
        max_pages: Number of the last listing page to fetch. Pages are
            numbered from 1, so a value below 1 fetches nothing.
    """
    for page in range(1, max_pages + 1):
        url = 'https://www.bhphotovideo.com/c/buy/accessories/ipp/100/mnp/25/Ns/p_PRICE_2%7c0/ci/20861/pn/' + str(page) + '/N/4005352853+35'
        listing = requests.get(url)
        soup = BeautifulSoup(listing.text, "html.parser")
        for link in soup.find_all('a', {'class': 'c5'}):
            href = link.get('href')
            if not href:
                # An anchor without a target would make requests.get() raise
                # inside single_item, aborting the whole crawl.
                continue
            # Brief pause before each item request (presumably rate limiting).
            time.sleep(0.3)
            single_item(href)
def single_item(item_url):
    """Fetch one product page, print its UPC, SKU and price, and log them.

    The plain values (not the surrounding HTML tags) are printed and then
    appended to the output file, one value per line in the order
    UPC, SKU, price.

    Args:
        item_url: Absolute URL of the product page to scrape.
    """
    response = requests.get(item_url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Start from empty strings so a page that lacks a field logs a blank line
    # instead of a leftover placeholder.
    upc = sku = price = ''

    # As before, every match is printed and the last match is the one logged.
    for tag in soup.find_all('span', {'class': 'upcNum'}):
        upc = tag.get_text(strip=True)
        print(upc)
    for tag in soup.find_all('span', {'class': 'fs16 c28'}):
        sku = tag.get_text(strip=True)
        print(sku)
    for tag in soup.find_all('meta', {'itemprop': 'price'}):
        # A <meta> tag has no text; its value is in the "content" attribute.
        price = tag.get('content', '')
        print(price)

    # "with" guarantees the file is closed even if a write fails.
    with open(r'C:\Users\abc.txt', 'a') as out_file:
        out_file.write(upc + '\n')
        out_file.write(sku + '\n')
        out_file.write(price + '\n')

# Crawl only the first listing page (max_pages=1).
spider(1)


What I want to get is "UPC:813066012487, price:26.45 and SKU:KBPTMCC2" without any span, meta or content attributes. I have attached my output below.
Here is my output:
screenshot

What am I doing wrong?
Hope someone can figure it out! Thanks!!

Answer

The data you want is in the div's data-itemdata attribute. You can call json.loads on it, which will give you a dict that you can access to get what you want:

from bs4 import BeautifulSoup
import requests
import json

# First page of the accessories listing.
LISTING_URL = "https://www.bhphotovideo.com/c/buy/accessories/ipp/100/mnp/25/Ns/p_PRICE_2%7c0/ci/20861/pn/1/N/4005352853+35"

response = requests.get(LISTING_URL)
soup = BeautifulSoup(response.content, "html.parser")

# Each product tile stores its details as a JSON string in "data-itemdata".
item_tiles = soup.select("div[data-selenium=itemDetail]")
for tile in item_tiles:
    data = json.loads(tile["data-itemdata"])
    print(data)

Each data dict will look like:

{u'catagoryId': u'20861',
 u'inCart': False,
 u'inWish': False,
 u'is': u'REG',
 u'itemCode': u'KBPTMCC2',
 u'li': [],
 u'price': u'26.45',
 u'searchTerm': u'',
 u'sku': u'890522'}

So just access the values by key, e.g. price = data["price"].

To get the UPC, we just need to visit each item's page. We can get its URL from the link inside the h3 that has a data-selenium attribute:

for d in soup.select("div[data-selenium=itemDetail]"):
    # select_one returns None when nothing matches, so check before indexing.
    link = d.select_one("h3[data-selenium] a")
    if link is None or not link.get("href"):
        continue  # tile without a product link: no item page to visit
    url = link["href"]

    # The UPC is read from the item's own page, so fetch and parse that page.
    item_soup = BeautifulSoup(requests.get(url).content, "html.parser")
    upc_tag = item_soup.select_one("span.upcNum")
    # None marks an item page that has no UPC span.
    upc = upc_tag.text.strip() if upc_tag is not None else None

    # Details (e.g. price, itemCode) embedded as JSON on the listing tile.
    data = json.loads(d["data-itemdata"])
Comments