C# Question

Web scraping a listings website

I'm trying to scrape a website. I've accomplished this on other projects, but I can't seem to get this one right. It could be that I've been up for over two days working and I'm missing something. Could someone please look over my code? Here it is:

using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;
using System.Xml.Linq;
using System.IO;

public partial class _Default : System.Web.UI.Page
{
    List<string> names = new List<string>();
    List<string> address = new List<string>();
    List<string> number = new List<string>();

    protected void Page_Load(object sender, EventArgs e)
    {
        string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + "4";
        var Webget = new HtmlWeb();
        var doc = Webget.Load(url);
        List<List<string>> mainList = new List<List<string>>();

        foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//h2//a"))
        {
            names.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
        }
        foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//p[@class='result-address']"))
        {
            address.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
        }
        foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//p[@class='result-number']"))
        {
            number.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
        }

        XDocument doccy = new XDocument(
            new XDeclaration("1.0", "utf-8", "yes"),
            new XComment("Business For Sale"),
            new XElement("Data",
                from data in mainList
                select new XElement("data", new XAttribute("data", "data"),
                    new XElement("Name : ", names[0]),
                    new XElement("Add : ", address[0]),
                    new XElement("Number : ", number[0])
                )
            )
        );

        var xml = doccy.ToString();

        Response.ContentType = "text/xml"; //Must be 'text/xml'
        Response.ContentEncoding = System.Text.Encoding.UTF8; //We'd like UTF-8
        doccy.Save(Response.Output); //Save to the text-writer
    }
}


The website lists business name, phone number and address, each identified by a class name (result-address, result-number, etc.). I'm trying to produce XML output with the business name, address and phone number from each listing on page 4 for a presentation tomorrow, but I can't get it to work at all!

The results are correct in all three foreach loops, but they won't output in the XML; I get an out of range error.

Answer

My first piece of advice would be to keep your code-behind as light as possible; if you bloat it with business logic, the solution becomes difficult to maintain. That's off topic, but I recommend looking up the SOLID principles. As for the errors: mainList is declared but never populated, so the LINQ query over it yields nothing, and element names like "Name : " contain spaces and a colon, which are illegal in XML element names, so those XElement constructors would throw if the query ever ran. The out of range error itself most likely comes from indexing an empty collection, for example node.ChildNodes[0] on a node with no children.

First, I've created a custom object to work with instead of three parallel lists of strings, which have no way of knowing which address item links up with which name:

public class Listing
{
    public string Name { get; set; }
    public string Address { get; set; }
    public string Number { get; set; }
}
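Note that XmlSerializer requires a public class with a parameterless constructor and public read/write properties, which this model satisfies.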

Here is the heart of it: a class that does all the scraping and serializing. (I've broken SOLID principles myself here, but sometimes you just want it to work.)

using System.Collections.Generic;
using HtmlAgilityPack;
using System.IO;
using System.Xml;
using System.Xml.Serialization;
using System.Linq;
public class TheScraper
{
    public List<Listing> DoTheScrape()
    {
        List<Listing> result = new List<Listing>();

        string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + "4";

        var webGet = new HtmlWeb();
        var doc = webGet.Load(url);

        // select the top-level container nodes; this is the closest we can get to an element that wraps each individual listing.
        var nodes = doc.DocumentNode.SelectNodes("//*[@id='list']/div/div/div/div");

        // loop through each child 
        if (nodes != null)
        {
            foreach (var node in nodes)
            {
                Listing listing = new Listing();

                // pull out each field; the null-conditional "?." operator returns null
                // instead of throwing when a node or attribute is missing.
                listing.Name = node.SelectSingleNode("./div/div/div/div/h2/a")?.InnerText;
                listing.Address = node.SelectSingleNode("./div/div/div/div/p[@class='result-address']")?.InnerText.Trim();
                listing.Number = node.SelectSingleNode("./div/div/div/div/p[@class='result-number']/a")?.Attributes["data-visible-number"]?.Value;

                result.Add(listing);
            }
        }

        // filter out the nulls
        result = result.Where(x => x.Name != null && x.Address != null && x.Number != null).ToList();

        return result;
    }

    public string SerializeTheListings(List<Listing> listings)
    {
        var xmlSerializer = new XmlSerializer(typeof(List<Listing>));

        using (var stringWriter = new StringWriter())
        using (var xmlWriter = XmlWriter.Create(stringWriter, new XmlWriterSettings { Indent = true }))
        {
            xmlSerializer.Serialize(xmlWriter, listings);
            return stringWriter.ToString();
        }
    }
}
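
For reference, XmlSerializer will produce XML of roughly this shape: a root ArrayOfListing element (carrying xsi/xsd namespace attributes, omitted here) containing one Listing element per item:

    <ArrayOfListing>
      <Listing>
        <Name>...</Name>
        <Address>...</Address>
        <Number>...</Number>
      </Listing>
    </ArrayOfListing>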

Then your code-behind would look something like this, plus references to the scraper class and the model class:

public partial class _Default : System.Web.UI.Page
{
    protected void Page_Load(object sender, EventArgs e)
    {
        TheScraper scraper = new TheScraper();
        List<Listing> listings = scraper.DoTheScrape();
        string xmlListings = scraper.SerializeTheListings(listings);
    }
}
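
If you still want the page to return the XML directly, as your original Page_Load did, you can write the serialized string to the response. A minimal sketch, reusing the content type and encoding from your original code:

public partial class _Default : System.Web.UI.Page
{
    protected void Page_Load(object sender, EventArgs e)
    {
        TheScraper scraper = new TheScraper();
        List<Listing> listings = scraper.DoTheScrape();
        string xmlListings = scraper.SerializeTheListings(listings);

        // send the XML back as the page response, as in the original code
        Response.ContentType = "text/xml";
        Response.ContentEncoding = System.Text.Encoding.UTF8;
        Response.Write(xmlListings);
        Response.End(); // stop further rendering so only the XML is sent
    }
}

One caveat: because SerializeTheListings writes to a StringWriter, the XML declaration will claim utf-16; set OmitXmlDeclaration = true in the XmlWriterSettings if that bothers you.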