Iman Memarpour Iman Memarpour - 1 month ago 11
C# Question

Reading XML and storing it in SQL Server. Getting duplicates

I am trying to read XML feed from the URL and store it in the database. The XML format looks like this:

<response version="2">
<totalresults>1249943</totalresults>
<results>
<result>
<jobtitle>Call Center </jobtitle>
<company>CVS Health</company>
<city>Work at Home</city>
</result>

<result>
<jobtitle>Java Programmer</jobtitle>
<company>Jonah Group</company>
<city>Toronto</city>
</result>
</results>
</response>


And I am trying to store job title, company, and city for all the jobs. There are millions of jobs. Here is my code in C#

public override void getJobsFromSource()
{
string url = @"http://api.indeed.com/ads/apisearch?publisher=5566998848654317&v=2&q=%22%22&filter=1%22%22&limit=25";
XmlDocument doc = new XmlDocument();
doc.Load(url);
int totalResults = int.Parse(doc.SelectSingleNode("response /totalresults").InnerText);

for (int i = 0; i < totalResults; i += 25)
{
string newUrl = $@"http://api.indeed.com/ads/apisearch?publisher=5566998848654317&v=2&q=%22%22&filter=1&limit=25&start={i}";
doc.Load(newUrl);
DataSet ds = new DataSet();
XmlNodeReader xmlReader = new XmlNodeReader(doc);

while (xmlReader.ReadToFollowing("results"))
{
ds.ReadXml(xmlReader);
}

if (ds.Tables.Count > 0)
{
SqlConnection con = new SqlConnection();
con.ConnectionString = "data source=10.0.0.76;initial catalog=JobSearchDB;persist security info=True;user id=sa;password=bonddbl07;MultipleActiveResultSets=True;App=EntityFramework";
con.Open();

SqlBulkCopy sbc = new SqlBulkCopy(con);
sbc.DestinationTableName = "IndeedJob";

sbc.ColumnMappings.Clear();
sbc.ColumnMappings.Add("jobtitle", "jobtitle");
sbc.ColumnMappings.Add("company", "company");
sbc.ColumnMappings.Add("city", "city");
sbc.WriteToServer(ds.Tables[0]);
con.Close();
}
}
}


The problem is that while jobs are unique, I am getting many duplicates in my tables. Duplicates come in random numbers whenever I run the program. Where am going wrong?

Answer

The webpage definitely has duplicates. I verified with code below. The webpage doesn't appear to have well formed xml so I had to modify your code to be able to read webpage. Using Linq I was able to remove the duplicates.

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Xml;
using System.Xml.Schema;

namespace WindowsFormsApplication1
{
    public partial class Form1 : Form
    {
        DataSet ds = new DataSet("Jobs");
        public Form1()
        {
            InitializeComponent();
            getJobsFromSource();
            DataTable dt = ds.Tables[0];
            dt = dt.AsEnumerable().GroupBy(x => x.Field <string>("jobkey")).Select(x => x.FirstOrDefault()).OrderBy(y => y.Field<string>("jobkey")).CopyToDataTable();
            dataGridView1.DataSource = dt;
        }
        public void getJobsFromSource()
{
            string url = @"http://api.indeed.com/ads/apisearch?publisher=5566998848654317&v=2&q=%22%22&filter=1%22%22&limit=25";
            XmlDocument doc = new XmlDocument();
            doc.Load(url);
            int totalResults = int.Parse(doc.SelectSingleNode("response /totalresults").InnerText);
            for (int i = 0; i < totalResults; i += 25)
            {
                string newUrl = @"http://api.indeed.com/ads/apisearch?publisher=5566998848654317&v=2&q=%22%22&filter=1&limit=25&start={i}";

                XmlReaderSettings settings = new XmlReaderSettings();
                settings.ValidationType = ValidationType.None;
                settings.IgnoreWhitespace = true;
                XmlReader xmlReader = XmlReader.Create(newUrl, settings);

                while (!xmlReader.EOF)
                {
                    if (xmlReader.Name != "result")
                    {
                        xmlReader.ReadToFollowing("result");
                    }
                    if(!xmlReader.EOF)
                    {
                        ds.ReadXml(xmlReader);
                    }
                }
            }
       }
    }
}
Comments