Python Question

My breadth-first search is not working

import urllib
from bs4 import BeautifulSoup
import requests
import time
import http.client

seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"

#url_list = list()


# --- functions ---



def get_urls(seed_url):
    r = requests.get(seed_url)
    time.sleep(1)
    soup = BeautifulSoup(r.content, "html.parser")
    links = soup.findAll('a', href=True)
    valid_links = []
    for links in links:
        if 'wiki' in links['href'] and '.' not in links['href'] and ':' not in links['href'] and '#' not in links['href']:
            valid_links.append(root_url + links['href'])
    return valid_links


def crawl(seed_url, depth, file_out):
    counter = 0
    while counter < depth:
        children = get_urls(seed_url)
        for child in children:
            if child not in visited:
                if len(visited) <= 1000:
                    visited.append(child)
                    print(child)
                    file_out.write(child + "\n")
        counter += 1


# --- main ---



seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"

visited=[root_url]
depth=1
file1 = open("unfocused_crawled.txt", "w+")

crawl(seed_url, 5, file1)

file1.close()


Breadth-first search

My BFS is not working: it only crawls under the first link and never goes back to the second link to crawl under it. Can someone tell me where I am going wrong, or show me how to fix the code?

Answer

You never update seed_url in crawl(), so each time through the while loop seed_url is still "https://en.wikipedia.org/wiki/Sustainable_energy" and you just re-crawl the same page.
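Here is a minimal, self-contained sketch of that behaviour, with the network call replaced by a stub (get_urls_stub and crawl_broken are names used only for this illustration):

def get_urls_stub(url):
    # stands in for get_urls(): always returns the same two children for a given url
    return [url + "/a", url + "/b"]

visited = []

def crawl_broken(seed_url, depth):
    counter = 0
    while counter < depth:
        children = get_urls_stub(seed_url)   # always the children of the original seed_url
        for child in children:
            if child not in visited:
                visited.append(child)
                print(child)
        counter += 1                         # the counter advances, but the page being crawled never does

crawl_broken("https://example.org/wiki/Seed", 3)
# prints the two stub children once; every later iteration finds them already visited and does nothing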

In BFS, you should keep a queue of items to visit: pull an item off the front of the queue and add its children to the end. In Python, collections.deque works well as a queue, since appending to the back and popping from the front are both cheap.
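For example, the queue discipline on its own looks like this (a standalone sketch; the URLs are placeholders):

from collections import deque

# each queue entry is (url, depth); placeholder URLs, for illustration only
queue = deque([("seed", 0)])
max_depth = 2

while queue:
    url, d = queue.popleft()               # take from the front: FIFO order gives breadth-first traversal
    print("  " * d + url)
    if d < max_depth:
        queue.append((url + "/a", d + 1))  # children go on the back of the queue, one level deeper
        queue.append((url + "/b", d + 1))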

The code below does the trick. Note that it does not enforce the 1,000-link limit from your original code; see the sketch after the code for one place you could add that check.

from bs4 import BeautifulSoup
import requests
import time
from collections import deque


class Node:

    def __init__(self, url, depth):

        self.url = url
        self.depth = depth


def get_urls(seed_url):
    r = requests.get(seed_url)
    time.sleep(1)
    soup = BeautifulSoup(r.content, "html.parser")
    links = soup.find_all('a', href=True)
    valid_links = []
    for link in links:
        if 'wiki' in link['href'] and '.' not in link['href'] and ':' not in link['href'] and '#' not in link['href']:
            valid_links.append(root_url + link['href'])
    return valid_links


def crawl(seed_url, depth, file_out):

    # initialize queue of links to visit, and list of links visited
    to_visit = deque([Node(seed_url, 0)])
    visited = list()

    # iterate through links to visit while there are still links left and while we haven't hit the max depth.
    # since we are doing BFS, the first time we hit a node that is too deep, we know all remaining nodes
    # in the queue are at least that deep
    while len(to_visit) > 0 and to_visit[0].depth < depth:

        # pop the node off of the queue
        node = to_visit.popleft()

        # get its children urls
        children_url = get_urls(node.url)

        # mark this node as visited
        visited.append(node.url)
        print(node.url)

        # iterate through its children to add to end of queue 
        for child_url in children_url:

            # make sure that you aren't already going to visit the node and that you haven't visited it
            if child_url not in visited and child_url not in (x.url for x in to_visit):

                # add it to the end of the queue of nodes to visit
                # note that we increment the depth by 1
                to_visit.append(Node(child_url, node.depth + 1))

                # write it to a file (you do this)
                file_out.write(child_url + "\n")

seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"
file1 = open("unfocused_crawled.txt", "w+")
crawl(seed_url, 5, file1)
file1.close()
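If you do want the 1,000-page cap from your original code, one way is to add it to the while condition. This is just a sketch: crawl_capped and MAX_PAGES are names introduced here, and it reuses Node, get_urls and deque from the answer above.

MAX_PAGES = 1000

def crawl_capped(seed_url, depth, file_out):
    # same as crawl() above, with one extra condition on the loop
    to_visit = deque([Node(seed_url, 0)])
    visited = list()

    # also stop once we have visited MAX_PAGES pages
    while to_visit and to_visit[0].depth < depth and len(visited) < MAX_PAGES:
        node = to_visit.popleft()
        children_url = get_urls(node.url)
        visited.append(node.url)
        print(node.url)

        for child_url in children_url:
            if child_url not in visited and child_url not in (x.url for x in to_visit):
                to_visit.append(Node(child_url, node.depth + 1))
                file_out.write(child_url + "\n")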