
Scrape data using 10 different keywords in the URL, for 2 pages each, and write the scraped data to a CSV using Python 3.6.2 and BS4

I have the code ready for one keyword and it's working fine. The next problem is that I want to run the scrape for 10 different keywords and save the results in one CSV file, with the keyword name in its own column. I think we could give a CSV file as input so it picks the keywords one by one and scrapes each. Here is the code:

import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
# excluding page from base_url for further adding
res = []
for page in range(1, 3):
    # here adding the page number
    request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
    if request.status_code == 404:  # added just in case of error
        break
    soup = BeautifulSoup(request.content, "lxml")
    for url in soup.find_all('li', class_='s-result-item'):
        res.append([url.get('data-asin'), url.get('id')])
df = pd.DataFrame(data=res, columns=['Asin', 'Result'])
df.to_csv('hel.csv')

Answer Source

I made some sample keywords; replace them with the ones you need.

import requests    
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ie=UTF8"

keywords_list = ['helmets for men', 'helmets for women']
# excluding keywords and page from base_url for further adding
res = []
for keyword in keywords_list:
    for page in range(1, 3):
        # here adding the url-encoded keyword and the page number
        request = requests.get(base_url + '&keywords=' + requests.utils.quote(keyword) + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
        if request.status_code == 404:  # stop paging this keyword on an error page
            break
        soup = BeautifulSoup(request.content, "lxml")
        for url in soup.find_all('li', class_='s-result-item'):
            res.append([url.get('data-asin'), url.get('id'), keyword])

df = pd.DataFrame(data=res, columns=['Asin', 'Result', 'Keyword'])
df.to_csv('hel.csv')
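
The question also mentions feeding the keywords in from a CSV file instead of hardcoding the list. A minimal sketch of that, assuming a file named keywords.csv with one keyword per row in its first column (the file name and layout are assumptions, not something given in the post):

import csv

# assumed input file: keywords.csv, one keyword per row, e.g.
#   helmets for men
#   helmets for women
with open('keywords.csv', newline='') as f:
    keywords_list = [row[0] for row in csv.reader(f) if row]

With that, keywords_list drops into the loops above unchanged. As a side note, passing index=False to to_csv keeps pandas from writing its row index as an extra column in hel.csv.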