ponyboil ponyboil - 2 months ago 40
C# Question

re-writing large XML Files - excluding certain node

I would like to re-write a large xml without some of its nodes.
I'm trying to read an XML file (hundreds of MB, too large to load into memory at once) node by node using System.Xml.XmlReader, but I'm struggling to find a way to read parts of it, write them to a separate XDocument, and then save that XDocument to disk.

What I've been thinking of is something like:

// Asker's attempt: stream the input with an XmlReader and copy everything except
// EL_TO_BE_REMOVED elements to filteredxml.xml. As posted, nothing is ever written
// to the writer because the only write call below is commented out.
using (XmlReader reader = XmlReader.Create(_xml_path))
{
using (XmlWriter writer = XmlWriter.Create(@"filteredxml.xml"))
{
// Skip ahead to the first content node (normally the root element).
reader.MoveToContent();

// Read() advances before the first check, so the node MoveToContent() stopped on
// is never examined by the loop body.
while (reader.Read())
{
if (reader.NodeType == XmlNodeType.Element)
{
if (reader.Name != "EL_TO_BE_REMOVED")
{
// Intended copy step, left disabled. ReadOuterXml() returns the current element's
// markup as a string, descendants included, so unwanted children cannot be filtered
// out at this point.
//writer.WriteNode(reader.ReadOuterXml());

}
}
}
}
}


But reader.ReadOuterXml() simply goes to the first element and writes it and all its descendants to the file, without letting me filter out the elements I wish to ignore.

Answer

With a big file and memory constraints, you should use a streaming (SAX-style) parser instead of DOM; XmlReader is indeed the C# equivalent, although it is a forward-only pull parser rather than an event-driven one.

This could be a basic approach, with an XmlReader for the input, an XmlWriter for the output, and a counter to remove nodes named RemoveMe (with all their content).

Notice the inner loop that clones the attributes of each relevant element.

        // Stream-copies OriginalXml to FilteredXml node by node, dropping every element
        // named RemoveMe together with its whole subtree. Only one node is held in memory
        // at a time, so the input may be arbitrarily large.
        // Whitespace-only nodes are not copied; the writer re-indents the output instead.
        using (XmlReader reader = XmlReader.Create(OriginalXml))
        {
            XmlWriterSettings ws = new XmlWriterSettings();
            ws.Indent = true;
            using (XmlWriter writer = XmlWriter.Create(FilteredXml, ws))
            {
                // Number of RemoveMe elements currently open; output is suppressed while > 0.
                int skip = 0;
                while (reader.Read())
                {
                    switch (reader.NodeType)
                    {
                        case XmlNodeType.Element:
                        {
                            // Capture this before visiting attributes: once the reader is
                            // positioned on an attribute, IsEmptyElement no longer describes
                            // the owning element.
                            bool isEmpty = reader.IsEmptyElement;
                            bool isRemoved = reader.Name.Equals(RemoveMe);

                            // A self-closing element (<x/>) never produces an EndElement node,
                            // so it must not be counted as "open" or skip would never return to 0.
                            if (isRemoved && !isEmpty)
                            {
                                skip++;
                            }

                            if (skip == 0 && !isRemoved)
                            {
                                // Namespace-aware overloads keep prefixed names (e.g. ns:item) valid.
                                writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
                                while (reader.MoveToNextAttribute())
                                {
                                    writer.WriteAttributeString(reader.Prefix, reader.LocalName, reader.NamespaceURI, reader.Value);
                                }

                                // Return from the last attribute to the owning element.
                                reader.MoveToElement();

                                if (isEmpty)
                                {
                                    // No EndElement node will follow, so close the element now.
                                    writer.WriteEndElement();
                                }
                            }

                            break;
                        }
                        case XmlNodeType.Text:
                            if (skip == 0)
                            {
                                writer.WriteString(reader.Value);
                            }
                            break;
                        case XmlNodeType.CDATA:
                            // Preserve CDATA sections instead of silently dropping their content.
                            if (skip == 0)
                            {
                                writer.WriteCData(reader.Value);
                            }
                            break;
                        case XmlNodeType.XmlDeclaration:
                        case XmlNodeType.ProcessingInstruction:
                            if (skip == 0)
                            {
                                writer.WriteProcessingInstruction(reader.Name, reader.Value);
                            }
                            break;
                        case XmlNodeType.Comment:
                            if (skip == 0)
                            {
                                writer.WriteComment(reader.Value);
                            }
                            break;
                        case XmlNodeType.EndElement:
                            if (skip == 0)
                            {
                                // Full end tag keeps <a></a> from collapsing into <a />.
                                writer.WriteFullEndElement();
                            }
                            skip -= reader.Name.Equals(RemoveMe) ? 1 : 0;
                            if (skip < 0)
                            {
                                // Defensive check: more RemoveMe end tags than start tags were seen.
                                throw new InvalidOperationException("wrong sequence");
                            }
                            break;
                    }
                }

            }
        }