Python Question

Why are my links not being written to my file?

import urllib
from bs4 import BeautifulSoup
import requests
import readability
import time
import http.client

seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"
max_limit=5
#file = open("file_crawled.txt", "w")

def get_urls(seed_url):
    r = requests.get(seed_url)
    soup = BeautifulSoup(r.content,"html.parser")
    links = soup.findAll('a', href=True)
    valid_links=[]
    for links in links:
        if 'wiki' in links['href'] and '.' not in links['href'] and ':' not in links['href'] and '#' not in links['href']:
            valid_links.append(root_url + links['href'])
    return valid_links


visited=[]
def crawl_dfs(seed_url, max_depth):
    depth=1
    file1 = open("file_crawled.txt", "w+")
    visited.append(root_url)
    if depth<=max_depth:
        children=get_urls(seed_url)
        for child in children:
            if child not in visited:
                file1.write(child)
                time.sleep(1)
                visited.append(child)
                crawl_dfs(child,max_depth-1)
    file1.close()

crawl_dfs(seed_url,max_limit)


DFS crawling, using Python 3.6.

Please help me with the code and correct where I am wrong: the crawled links are not being written to the file opened as file1. I don't know why; I have tried everything at my end.

Answer

You have to open and close the file only once: open it before the first crawl_dfs() call and close it after that call returns. Opening the file with "w+" inside the recursive function truncates it on every recursive call, so links written by earlier calls are lost. It also helps to write a newline after each link so the URLs don't all end up on one line.
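As a small standalone illustration of what went wrong (demo.txt is just a throwaway name for this example): every open(..., "w+") truncates the file, so each recursive call erased what earlier calls had written.

with open("demo.txt", "w+") as f:
    f.write("first link\n")

with open("demo.txt", "w+") as f:   # reopening with "w+" truncates the file again
    f.write("second link\n")

# demo.txt now contains only "second link"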

Tested:

import urllib
from bs4 import BeautifulSoup
import requests
#import readability
import time
import http.client

# --- functions ---

def get_urls(seed_url):
    # download the page and collect the hrefs that look like internal wiki article links
    r = requests.get(seed_url)
    soup = BeautifulSoup(r.content, "html.parser")
    links = soup.findAll('a', href=True)
    valid_links = []
    for link in links:
        # keep only wiki links without dots, colons or anchors
        if 'wiki' in link['href'] and '.' not in link['href'] and ':' not in link['href'] and '#' not in link['href']:
            valid_links.append(root_url + link['href'])
    return valid_links


def crawl_dfs(seed_url, max_depth, file_out):
    # depth-first crawl: write each unvisited link to the shared file handle, then recurse into it
    if max_depth >= 1:
        children = get_urls(seed_url)
        for child in children:
            if child not in visited:
                file_out.write(child + "\n")
                #time.sleep(1)
                visited.append(child)
                crawl_dfs(child, max_depth-1, file_out)

# --- main ---

seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"
max_limit = 1

visited=[root_url]

file1 = open("file_crawled.txt", "w+")

crawl_dfs(seed_url, max_limit, file1)

file1.close()
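As a small variant of the main part, a with statement can manage the file so it is closed automatically even if requests.get() raises somewhere in the recursion. This sketch assumes the same get_urls(), crawl_dfs(), seed_url, root_url and max_limit as above:

# same crawl, but the with block closes the file automatically,
# even if an exception is raised during the recursion
visited = [root_url]

with open("file_crawled.txt", "w") as file1:
    crawl_dfs(seed_url, max_limit, file1)

# file_crawled.txt then contains one crawled URL per line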