User:Equinox/code/SplitWiktDumpXml

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;


namespace SplitWiktDumpXml
{
    /// <summary>
    /// Reads a Wiktionary XML dump of the type "All pages, current versions only" and splits it
    /// into individual files, one per page -- but preserving only English mainspace entries.
    /// Each kept page is written to the output folder as "&lt;page id&gt;.txt".
    /// </summary>
    /// <remarks>
    /// Be sure you want to run this. It creates a huge number of files and your OS will cry.
    /// <para>
    /// The dump is scanned line by line rather than parsed as XML, so the matching below relies on
    /// the page, title, namespace and page-id tags each occupying a line of their own.
    /// </para>
    /// </remarks>
    class Program
    {
        // Defaults used when no command-line arguments are supplied.
        private const string INPUT_FILE = @"c:\Users\home\Desktop\enwiktionary-20161001-pages-meta-current.xml";
        private const string OUTPUT_FOLDER = @"c:\Users\home\Desktop\output";


        /// <summary>
        /// Replaces the five predefined XML entities in <paramref name="s"/> with the characters
        /// they stand for. Numeric character references are not handled.
        /// </summary>
        /// <param name="s">Escaped text, e.g. the content of a title element.</param>
        /// <returns>The text with the predefined entities unescaped.</returns>
        private static string UnescapeXmlString(string s)
        {
            // Do the ampersand last to ensure it cannot create false positives for any of the others!
            return s.Replace("&lt;", "<")
                    .Replace("&gt;", ">")
                    .Replace("&quot;", "\"")
                    .Replace("&apos;", "'")
                    .Replace("&amp;", "&");
        }


        /// <summary>
        /// Returns the text between <paramref name="openTag"/> and <paramref name="closeTag"/> when
        /// <paramref name="trimmedLine"/> starts with the former and ends with the latter;
        /// otherwise returns null.
        /// </summary>
        private static string GetSingleLineElementText(string trimmedLine, string openTag, string closeTag)
        {
            // note: without StringComparison.Ordinal, matching fails on e.g. the Hawaiian okina character
            if (trimmedLine.Length < openTag.Length + closeTag.Length
                || !trimmedLine.StartsWith(openTag, StringComparison.Ordinal)
                || !trimmedLine.EndsWith(closeTag, StringComparison.Ordinal))
            {
                return null;
            }

            // Strip only the outer tag pair; the content itself is left untouched.
            return trimmedLine.Substring(openTag.Length, trimmedLine.Length - openTag.Length - closeTag.Length);
        }


        /// <summary>
        /// Scans the dump line by line and writes every page that is in namespace 0 and contains
        /// an "==English==" header to <paramref name="outputFolder"/>.
        /// </summary>
        /// <param name="reader">Source of the dump text.</param>
        /// <param name="outputFolder">Existing folder that receives one file per kept page.</param>
        /// <exception cref="InvalidDataException">
        /// A page starts inside another page, a page id is not a valid integer, or a page that
        /// should be kept has no id or title.
        /// </exception>
        private static void SplitDump(TextReader reader, string outputFolder)
        {
            bool inPage = false, foundEnglishHeader = false, foundMainspace = false;
            bool justSeenNamespace = false;
            int? pageId = null;
            string pageTitle = null;
            string curLine;

            // A StringBuilder avoids re-copying the whole page for every appended line.
            StringBuilder pageSoFar = new StringBuilder();

            while ((curLine = reader.ReadLine()) != null)
            {
                string trimmed = curLine.Trim();

                if (trimmed == "<page>")
                {
                    if (inPage)
                    {
                        throw new InvalidDataException("Found <page> before the previous page was closed.");
                    }

                    inPage = true;
                }

                if (!inPage)
                {
                    continue;
                }

                pageSoFar.AppendLine(curLine);

                // We're not parsing the XML properly, but in practice this is probably good enough:
                if (!foundEnglishHeader && curLine.Contains("==English=="))
                {
                    foundEnglishHeader = true;
                }

                if (justSeenNamespace)
                {
                    // Lame hack: we rely on the <id> line always immediately following the <ns> line.
                    // Otherwise we would match other, unwanted IDs that are nested in sub-elements.
                    string idText = GetSingleLineElementText(trimmed, "<id>", "</id>");

                    if (idText != null)
                    {
                        int parsedId;

                        // Dump content is machine-generated, so parse independently of the user's culture.
                        if (!int.TryParse(idText,
                                          System.Globalization.NumberStyles.Integer,
                                          System.Globalization.CultureInfo.InvariantCulture,
                                          out parsedId))
                        {
                            throw new InvalidDataException("Page <id> is not a valid integer: " + idText);
                        }

                        pageId = parsedId;
                    }
                }

                // Only the first mainspace <ns> line of a page arms the <id> lookup for the next line.
                justSeenNamespace = !foundMainspace && trimmed == "<ns>0</ns>";
                if (justSeenNamespace)
                {
                    foundMainspace = true;
                }

                if (pageTitle == null)
                {
                    string rawTitle = GetSingleLineElementText(trimmed, "<title>", "</title>");

                    if (rawTitle != null)
                    {
                        pageTitle = UnescapeXmlString(rawTitle);
                    }
                }

                if (trimmed == "</page>")
                {
                    if (foundMainspace && foundEnglishHeader)
                    {
                        if (pageId == null || pageTitle == null)
                        {
                            throw new InvalidDataException("English mainspace page is missing its id or title.");
                        }

                        Console.WriteLine(pageId + ": " + pageTitle);

                        // This is a page we want, so write it to disk with page ID as filename.
                        string fileName = pageId.Value.ToString(System.Globalization.CultureInfo.InvariantCulture) + ".txt";
                        File.WriteAllText(Path.Combine(outputFolder, fileName), pageSoFar.ToString());
                    }

                    // Reset all per-page state ready for the next <page>.
                    pageId = null;
                    pageTitle = null;
                    pageSoFar.Length = 0;
                    foundEnglishHeader = foundMainspace = false;
                    inPage = false;
                }
            }
        }


        /// <summary>
        /// Entry point. Splits the dump, then beeps and waits for Enter.
        /// </summary>
        /// <param name="args">
        /// Optional: <c>args[0]</c> overrides the input dump path and <c>args[1]</c> overrides the
        /// output folder. When omitted, the built-in defaults are used.
        /// </param>
        static void Main(string[] args)
        {
            string inputFile = (args != null && args.Length >= 1) ? args[0] : INPUT_FILE;
            string outputFolder = (args != null && args.Length >= 2) ? args[1] : OUTPUT_FOLDER;

            Directory.CreateDirectory(outputFolder);

            using (StreamReader reader = new StreamReader(inputFile, Encoding.UTF8))
            {
                SplitDump(reader, outputFolder);
            }

            Console.Beep();
            Console.WriteLine("Press Enter to exit.");
            Console.ReadLine();
        }
    }
}
User:Equinox/code/SplitWiktDumpXml

Navigation menu

Search