I am trying to scrape the companies listed in the `li` items of the `ul` list under the "Final Result" heading. The source code looks like this:
import string
import re
import urllib2
import datetime
import bs4
from bs4 import BeautifulSoup
class AJSpider(object):
    """Scrape the entries listed under the "Final Result" heading.

    The spider downloads the forward-diary page for a given date, parses it
    with BeautifulSoup and collects the ``<li>`` items of the ``<ul>`` that
    follows the ``<h3>Final Result</h3>`` heading.
    """

    def __init__(self):
        print ("initisizing")
        # str() of a date yields its ISO form; it is substituted into the
        # {date} placeholder of cur_url.
        self.date = str(datetime.date.today())
        self.cur_url = "https://youinvest.moneyam.com/modules/forward-diary/?date={date}&period=month"
        # Filled by start_spider() with the tuples returned by get_final().
        self.datas = []
        print ("initisization done")

    def get_page(self, cur_date):
        """Download and parse the diary page for ``cur_date``.

        Args:
            cur_date: value substituted into the ``{date}`` placeholder of
                ``self.cur_url``.

        Returns:
            The parsed ``BeautifulSoup`` document, or ``None`` when the page
            could not be fetched or decoded.
        """
        url = self.cur_url.format(date=cur_date)
        try:
            response = urllib2.urlopen(url)
            try:
                my_page = response.read().decode("utf-8")
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except (IOError, ValueError):
            # URLError/HTTPError and socket errors derive from IOError;
            # UnicodeDecodeError and malformed-URL errors from ValueError.
            print ('Failed')
            return None
        return BeautifulSoup(my_page, "html.parser")

    def get_final(self, soup_page):
        """Extract the list items that follow the "Final Result" heading.

        Args:
            soup_page: parsed document as returned by ``get_page``; may be
                ``None`` when the download failed.

        Returns:
            A list of ``(span_text, link_text)`` tuples, one per ``<li>``.
            Presumably the ``<a>`` text is the company name -- verify against
            the live page. The list is empty when the page, the heading or
            the list is missing.
        """
        if soup_page is None:
            return []
        # ``text=`` is kept (rather than the newer ``string=``) so that older
        # BeautifulSoup 4 releases keep working.
        heading = soup_page.find("h3", text="Final Result")
        if heading is None:
            return []
        listing = heading.find_next_sibling("ul")
        if listing is None:
            return []
        results = []
        for item in listing.find_all("li"):
            # Either child tag may be absent; fall back to an empty string
            # instead of raising AttributeError on None.
            span_text = item.span.get_text() if item.span is not None else ""
            link_text = item.a.get_text() if item.a is not None else ""
            results.append((span_text, link_text))
            print ("%s %s" % (span_text, link_text))
        return results

    def start_spider(self):
        """Fetch today's page, store the extracted entries and return them."""
        my_page = self.get_page(self.date)
        self.datas = self.get_final(my_page)
        return self.datas
def main():
    """Build an AJSpider and run one scrape."""
    AJSpider().start_spider()


# Run the spider only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Find the `h3` element by its text and get the `ul` list that follows it:
# Locate the heading by its text, then step to the <ul> that is its next
# sibling and print the <span> and <a> text of every list item.
heading = soup.find("h3", text="Final Result")
ul = heading.find_next_sibling("ul")
for li in ul.find_all("li"):
    print(li.span.get_text(), li.a.get_text())
Note that in recent versions of BeautifulSoup, the `text` argument was renamed to `string`, but both work because of backwards compatibility.