// User:Equinox/code/Antiblue
//
// From Wiktionary, the free dictionary.
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using DotNetWikiBot;


namespace Antiblue
{
    /// <summary>
    /// Removes the unadorned "blue links" from a Wiktionary text.
    /// </summary>
    /// <remarks>
    /// This is pretty slow because it checks each (unique) word in real time, rather than using a downloaded dump.
    /// However, things change fast, and using an outdated dump is always liable to miss things.
    /// The input file is rewritten in place: every plain <c>[[word]]</c> link whose target page already
    /// exists (optionally: already has a section for <see cref="LANGUAGE_NAME"/>) is deleted from the text,
    /// leaving only the links that still need an entry.
    /// </remarks>
    class Program
    {
        // Text file that is read, filtered and then overwritten with the result.
        private const string FILE_INPUT = @"C:\Users\home\Desktop\input.txt";

        // Wiki account used to log in.
        private const string USER = "Equinox";

        private const string LANGUAGE_NAME = "English"; // null to remove existing links in any language

        private const int SLEEPY_TIME = 1000; // milliseconds between requests; avoid blasting the server too hard

        // When true, runs of spaces left behind by removed links are collapsed to a single space.
        private const bool COALESCE_SPACES = true;


        private static Encoding _pageEncoding = Encoding.GetEncoding(1252); // depending on your input file

        private static Site _site = null;

        // Cache of page title -> "link should be removed", so each unique title is fetched only once.
        private static readonly Dictionary<string, bool> _checkedWords = new Dictionary<string, bool>();

        private static readonly List<string> _letTheseLinksLive = new List<string>(); // any words not to be removed


        /// <summary>
        /// Entry point: prompts for the account password and runs the link filter.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        private static void Main(string[] args)
        {
            // 3072 is the numeric value of TLS 1.2; presumably cast because the targeted
            // framework's SecurityProtocolType enum lacks the named member.
            ServicePointManager.SecurityProtocol = (SecurityProtocolType) 3072; // TLS 1.2

            Console.WriteLine("CONFIRM BEFORE USE: Language to check: " + (LANGUAGE_NAME ?? "(any language)"));

            Console.WriteLine("Enter password for " + USER + ": ");
            string password = Console.ReadLine();
            Console.Clear(); // wipe the typed password off the screen

            StartBot(USER, password);
        }


        /// <summary>
        /// Logs in to English Wiktionary, strips existing-entry links from the input file and
        /// writes the result back to the same file.
        /// </summary>
        /// <param name="user">Account name to log in with.</param>
        /// <param name="password">Password for <paramref name="user"/>.</param>
        private static void StartBot(string user, string password)
        {
            _site = new Site("https://en.wiktionary.org", user, password);

            string source = File.ReadAllText(FILE_INPUT, _pageEncoding);

            // Look for any string inside [[...]], except where the | splitter is involved.

            source = Regex.Replace(source, @"\[\[[^\]\|]*\]\]", BlueLinkEvaluator);

            if (COALESCE_SPACES)
            {
                source = Regex.Replace(source, " +", " ");
            }

            File.WriteAllText(FILE_INPUT, source, _pageEncoding);

            Console.WriteLine();
            Console.WriteLine("Press Enter to exit.");
            Console.ReadLine();
        }


        /// <summary>
        /// Decides the replacement for one <c>[[...]]</c> match: an empty string when the target
        /// already has an entry (the link is dropped), otherwise the original link text.
        /// </summary>
        /// <param name="m">A match of the plain-link regex, including the surrounding brackets.</param>
        /// <returns><see cref="String.Empty"/> to remove the link, or <c>m.Value</c> to keep it.</returns>
        private static string BlueLinkEvaluator(Match m)
        {
            string wordWithoutBrackets = m.Value.Substring("[[".Length, m.Value.Length - "[[]]".Length);

            // The page to look up is the part before any "#section" fragment, without
            // surrounding whitespace; '#' can never be part of a page title.
            string pageTitle = wordWithoutBrackets;
            int fragmentStart = pageTitle.IndexOf('#');
            if (fragmentStart >= 0)
            {
                pageTitle = pageTitle.Substring(0, fragmentStart);
            }
            pageTitle = pageTitle.Trim();

            // "[[]]", "[[   ]]" and "[[#Section]]" name no page at all. Leave them untouched
            // instead of asking the wiki library to load an empty title (which is expected to
            // fail and would abort the run before anything is written — TODO confirm).
            if (pageTitle.Length == 0)
            {
                return m.Value;
            }

            if (_letTheseLinksLive.Contains(wordWithoutBrackets) || _letTheseLinksLive.Contains(pageTitle))
            {
                return m.Value;
            }

            bool entryExists;
            if (!_checkedWords.TryGetValue(pageTitle, out entryExists))
            {
                // We haven't seen this word yet, so we need to check its existence on Wiktionary.
                // Assume that "==LANGUAGE_NAME==" in the page source means we have found an existing entry.

                Console.WriteLine(pageTitle + "...");

                Page p = new Page(_site, pageTitle);
                p.Load();

                if (LANGUAGE_NAME == null)
                {
                    entryExists = !String.IsNullOrEmpty(p.text);
                }
                else // specific language
                {
                    string languageHeader = "==" + LANGUAGE_NAME + "==";
                    entryExists = p.text != null && p.text.Contains(languageHeader);
                }

                _checkedWords[pageTitle] = entryExists;

                // Only pause after a real request; cached answers cost the server nothing.
                Thread.Sleep(SLEEPY_TIME);
            }

            return entryExists ? String.Empty : m.Value;
        }
    }
}