// User:Equinox/code/SplitWiktDumpXml
// (Wiki page navigation links "Jump to navigation" / "Jump to search" omitted.)
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace SplitWiktDumpXml
{
/// <summary>
/// Reads a Wiktionary XML dump of the type "All pages, current versions only" and splits it
/// into individual files, one per page -- but preserving only English mainspace entries.
/// Each kept page is written to the output folder as <c>&lt;page id&gt;.txt</c>.
/// </summary>
/// <remarks>
/// <para>Be sure you want to run this. It creates a huge number of files and your OS will cry.</para>
/// <para>
/// The dump is scanned line by line rather than parsed as XML. The scan relies on
/// <c>&lt;page&gt;</c>, <c>&lt;/page&gt;</c>, <c>&lt;ns&gt;</c>, <c>&lt;id&gt;</c> and <c>&lt;title&gt;</c>
/// each appearing on a line of their own.
/// </para>
/// </remarks>
class Program
{
    private const string INPUT_FILE = @"c:\Users\home\Desktop\enwiktionary-20161001-pages-meta-current.xml";
    private const string OUTPUT_FOLDER = @"c:\Users\home\Desktop\output";

    /// <summary>
    /// Replaces the five predefined XML entities (&amp;lt; &amp;gt; &amp;quot; &amp;apos; &amp;amp;)
    /// in <paramref name="s"/> with the characters they stand for.
    /// Numeric character references are not decoded.
    /// </summary>
    /// <param name="s">XML-escaped text; must not be null.</param>
    /// <returns>The text with the five predefined entities decoded.</returns>
    private static string UnescapeXmlString(string s)
    {
        // Do the ampersand last to ensure it cannot create false positives for any of the others!
        // For example "&amp;lt;" must decode to the literal text "&lt;", not to "<".
        return s.Replace("&lt;", "<")
                .Replace("&gt;", ">")
                .Replace("&quot;", "\"")
                .Replace("&apos;", "'")
                .Replace("&amp;", "&");
    }

    /// <summary>
    /// Returns the text between <c>&lt;tag&gt;</c> and <c>&lt;/tag&gt;</c> when
    /// <paramref name="line"/> starts with the opening tag and ends with the closing tag;
    /// otherwise returns null.
    /// </summary>
    /// <param name="line">A single, already trimmed line of the dump.</param>
    /// <param name="tag">Element name without angle brackets, e.g. "id".</param>
    private static string ExtractElementText(string line, string tag)
    {
        string open = "<" + tag + ">";
        string close = "</" + tag + ">";

        // Ordinal comparison is required: culture-sensitive matching fails on some titles
        // (e.g. ones containing the Hawaiian okina character).
        if (line.Length < open.Length + close.Length
            || !line.StartsWith(open, StringComparison.Ordinal)
            || !line.EndsWith(close, StringComparison.Ordinal))
        {
            return null;
        }

        return line.Substring(open.Length, line.Length - open.Length - close.Length);
    }

    /// <summary>
    /// Scans the dump line by line and writes every mainspace page that contains an
    /// "==English==" header to <paramref name="outputFolder"/>, using the page ID as file name.
    /// </summary>
    /// <param name="reader">Reader positioned at the start of the dump.</param>
    /// <param name="outputFolder">Existing folder that receives the per-page files.</param>
    /// <exception cref="InvalidDataException">
    /// A page starts before the previous one ended, or a wanted page has no ID or title.
    /// </exception>
    private static void SplitDump(TextReader reader, string outputFolder)
    {
        bool inPage = false, foundEnglishHeader = false, foundMainspace = false;
        bool justSeenNamespace = false;
        int? pageId = null;
        string pageTitle = null;
        // StringBuilder avoids re-copying the whole page text for every appended line.
        StringBuilder pageSoFar = new StringBuilder();

        string curLine;
        while ((curLine = reader.ReadLine()) != null)
        {
            string trimmed = curLine.Trim();

            if (trimmed == "<page>")
            {
                if (inPage)
                {
                    throw new InvalidDataException("Found <page> before the previous page was closed.");
                }
                inPage = true;
            }

            if (!inPage)
            {
                continue;
            }

            pageSoFar.AppendLine(curLine);

            // We're not parsing the XML properly, but in practice this is probably good enough:
            if (!foundEnglishHeader && curLine.Contains("==English=="))
            {
                foundEnglishHeader = true;
            }

            // Lame hack: we rely on the <id> line always immediately following the <ns> line.
            // Otherwise we would match other, unwanted IDs that are nested in sub-elements.
            // This check must run before justSeenNamespace is updated for the current line.
            if (justSeenNamespace)
            {
                string idText = ExtractElementText(trimmed, "id");
                if (idText != null)
                {
                    pageId = int.Parse(
                        idText,
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture);
                }
            }

            if (!foundMainspace && trimmed == "<ns>0</ns>")
            {
                foundMainspace = true;
                justSeenNamespace = true;
            }
            else
            {
                justSeenNamespace = false;
            }

            // Only the first <title> line of a page is used.
            if (pageTitle == null)
            {
                string titleText = ExtractElementText(trimmed, "title");
                if (titleText != null)
                {
                    pageTitle = UnescapeXmlString(titleText);
                }
            }

            if (trimmed == "</page>")
            {
                if (foundMainspace && foundEnglishHeader)
                {
                    if (pageId == null || pageTitle == null)
                    {
                        throw new InvalidDataException(
                            "Reached </page> for a mainspace English page without having seen its "
                            + (pageId == null ? "<id>" : "<title>") + " line.");
                    }

                    string idString = pageId.Value.ToString(System.Globalization.CultureInfo.InvariantCulture);
                    Console.WriteLine(idString + ": " + pageTitle);

                    // This is a page we want, so write it to disk with page ID as filename.
                    string file = Path.Combine(outputFolder, idString + ".txt");
                    File.WriteAllText(file, pageSoFar.ToString());
                }

                // Reset all per-page state for the next <page>.
                pageId = null;
                pageTitle = null;
                pageSoFar.Length = 0;
                foundEnglishHeader = foundMainspace = false;
                inPage = false;
            }
        }
    }

    /// <summary>
    /// Entry point: creates the output folder, splits the dump, then waits for Enter.
    /// </summary>
    /// <param name="args">
    /// Optional overrides: <c>args[0]</c> is the input dump path and <c>args[1]</c> is the
    /// output folder. Missing values fall back to <c>INPUT_FILE</c> and <c>OUTPUT_FOLDER</c>.
    /// </param>
    static void Main(string[] args)
    {
        string inputFile = args.Length > 0 ? args[0] : INPUT_FILE;
        string outputFolder = args.Length > 1 ? args[1] : OUTPUT_FOLDER;

        Directory.CreateDirectory(outputFolder);
        using (StreamReader reader = new StreamReader(inputFile, Encoding.UTF8))
        {
            SplitDump(reader, outputFolder);
        }

        Console.Beep();
        Console.WriteLine("Press Enter to exit.");
        Console.ReadLine();
    }
}
}