User:Erutuon/scripts/fix Cyrillic.py

From Wiktionary, the free dictionary
Jump to navigation Jump to search
#! /usr/bin/env python3

from sys import argv
import json
import regex as re
from pywikibot import Page, Site
import mwparserfromhell
from unicodedata import name as character_name, normalize

# The script takes exactly one argument: the name of a data file inside the
# wrong_script/ directory.
if len(argv) != 2:
    raise ValueError("expected one commandline argument: filename")

print("filename:", argv[1])
# Read the whole data file up front.  The context manager closes the handle
# promptly, and the encoding is explicit because the data contains non-ASCII
# text and must not depend on the platform's default encoding.
with open("wrong_script/" + argv[1], "r", encoding="utf-8") as data_file:
    text = data_file.read()

# Wiki on which pages are loaded and saved.
site = Site(code="en", fam="wiktionary")

# Homoglyph table: each key is a Latin character and each value is the
# Cyrillic text that replace_Latin_with_Cyrillic substitutes for it.
# Adapted from https://en.wikipedia.org/wiki/User:Trey314159/homoglyphHunter.js
# with one recorded change from that source:
# "I":"І", -> "I":"ӏ",
# Some values (for example the one for "á") appear to be a base letter
# followed by a combining mark rather than a single character.
# NOTE(review): the last row appears to repeat the "ō"/"Ō" entries of the row
# above it -- confirm whether another letter (or another normalization form)
# was intended; if the keys are identical the repetition is harmless.
Latin_to_Cyrillic = {
    "a":"а", "A":"А", "ă":"ӑ", "Ă":"Ӑ", "ä":"ӓ", "Ä":"Ӓ", "æ":"ӕ", "Æ":"Ӕ",
    "B":"В", "c":"с", "C":"С", "ç":"ҫ", "Ç":"Ҫ", "e":"е", "E":"Е", "è":"ѐ",
    "È":"Ѐ", "ë":"ё", "Ë":"Ё", "ĕ":"ӗ", "Ĕ":"Ӗ", "ə":"ә", "Ə":"Ә", "H":"Н",
    "i":"і", "I":"ӏ", "ï":"ї", "Ï":"Ї", "ḯ":"ї́", "Ḯ":"Ї́", "j":"ј", "J":"Ј",
    "k":"к", "K":"К", "M":"М", "m":"м", "o":"о", "O":"О", "ö":"ӧ", "Ö":"Ӧ", "p":"р",
    "P":"Р", "Q":"Ԛ", "s":"ѕ", "S":"Ѕ", "T":"Т", "W":"Ԝ", "x":"х", "X":"Х",
    "y":"у", "Y":"У", "ȳ":"ӯ", "Ȳ":"Ӯ", "ÿ":"ӱ", "Ÿ":"Ӱ", "á":"а́", "é":"е́",
    "í":"і́", "ó":"о́", "ý":"у́", "ħ":"ћ", "ɜ":"з", "ò":"о̀", "Ò":"О̀", "l":"ӏ",
    "h":"һ", "ā":"а̄", "Ā":"А̄", "ē":"е̄", "Ē":"Е̄", "ī":"і̄", "ō":"о̄", "Ō":"О̄",
    "ō":"о̄", "Ō":"О̄",
}

# Matches every single character, newlines included.
single_char = re.compile(".", re.DOTALL)
def replace_Latin_with_Cyrillic (text):
    """
    Return text with Latin look-alike characters replaced by Cyrillic ones.

    The substitution from Latin_to_Cyrillic is applied twice: once to the
    text as given and once to its NFD (decomposed) form.  The result is
    returned in NFC.
    """
    def to_Cyrillic (match):
        # Characters without an entry in the table are kept unchanged.
        found = match[0]
        return Latin_to_Cyrillic.get(found, found)

    # The first pass must run before decomposition so that precomposed Latin
    # letters are mapped to their dedicated Cyrillic letters.
    # desired character:
    # ҫ (CYRILLIC SMALL LETTER ES WITH DESCENDER)
    # character resulting from replacement after decomposition:
    # с̧ (CYRILLIC SMALL LETTER ES, COMBINING CEDILLA)
    first_pass = single_char.sub(to_Cyrillic, text)

    # The second pass catches base letters of accented characters that have
    # no table entry of their own.
    decomposed = normalize("NFD", first_pass)
    second_pass = single_char.sub(to_Cyrillic, decomposed)

    return normalize("NFC", second_pass)

def show_graphemes (graphemes):
    """
    Describe graphemes as a comma-separated string of wikilinks.

    A grapheme of at most one character becomes "[[x]]".  A longer grapheme
    becomes a link to its first character followed by " with " and the
    lower-cased Unicode names of the remaining characters joined by " and ".
    """
    descriptions = []

    for grapheme in graphemes:
        if len(grapheme) > 1:
            base = grapheme[0]
            mark_names = [ character_name(mark).lower() for mark in grapheme[1:] ]
            descriptions.append("[[" + base + "]] with " + " and ".join(mark_names))
        else:
            descriptions.append("[[" + grapheme + "]]")

    return ", ".join(descriptions)

def iterate_template_data (text, skip_to_title):
    """
    Yield (title, templates) for each JSON record in text, one record per line.

    Each non-blank line must be a JSON object with "title" and "templates"
    keys.  If skip_to_title is not None, records are skipped until one whose
    title equals skip_to_title; that record and all later ones are yielded.
    If no record has that title, nothing is yielded.

    Raises json.JSONDecodeError for a malformed line and KeyError for a
    record lacking one of the two keys.
    """
    start_processing = skip_to_title is None

    # Split on "\n" only: str.splitlines() also breaks on characters such as
    # U+2028, U+2029 and U+0085, which may occur unescaped inside a JSON
    # string and would cut a record in half.  A trailing "\r" left over from
    # "\r\n" line endings is whitespace to the JSON parser.
    for line in text.split("\n"):
        # Blank lines (including the empty piece after a final newline)
        # carry no record.
        if not line.strip():
            continue

        data = json.loads(line)
        title = data["title"]

        if not start_processing:
            if title != skip_to_title:
                continue
            start_processing = True

        yield title, data["templates"]

def _summarize_corrections (corrections):
    """
    Build the edit summary for a non-empty list of (Latin, Cyrillic) pairs.

    When every pair is the same, the pair is shown once, followed by the
    number of replacements if there was more than one.  Otherwise every pair
    is listed in order.
    """
    Latin, Cyrillic = zip(*corrections)
    correction_len = 0

    if len(set(Latin)) == 1 and len(set(Cyrillic)) == 1:
        correction_len = len(Latin)
        Latin = Latin[:1]
        Cyrillic = Cyrillic[:1]

    len_str = " " + str(correction_len) + " times" if correction_len > 1 else ""
    return "replaced Latin {} with Cyrillic {}{}".format(show_graphemes(list(Latin)),
                                                         show_graphemes(list(Cyrillic)),
                                                         len_str)

def _ask_whether_to_save ():
    """
    Prompt until the answer starts with y, n or q; return that letter.

    An empty answer repeats the prompt without comment.
    """
    while True:
        answer = input("> Save edit? y/n (or quit: q)\n>>> ")

        if not answer:
            continue

        choice = answer[0].lower()
        if choice in ("y", "n", "q"):
            return choice

        print("> Answer not recognized.")

def process_pages (text, skip_to):
    """
    Interactively replace Latin look-alikes in template parameters.

    text holds one JSON record per line (see iterate_template_data); each
    entry of a record's "templates" list is read through the keys
    "template" (wikitext of the template), "lang", "text" (the parameter
    value to check) and "param" (the parameter to overwrite).  skip_to is
    the title at which to start, or None to start at the first record.

    For each page, every parameter value that changes under
    replace_Latin_with_Cyrillic and is not purely Latin is rewritten in the
    page text.  If that produced recorded corrections, the user is asked
    whether to save.

    Returns the title (as it appears in text) of the page at which the user
    chose to quit, or None when all records were processed.
    """
    only_Latin = re.compile(r"^[\p{Latn}\p{Zinh}\p{Zyyy}]+$")

    for (title, templates) in iterate_template_data(text, skip_to):
        print("title: [[{}]]".format(title))

        corrections = []

        # Avoid loading page if no changes need to be made.
        page = None
        old_text = None

        for instance in templates:
            template_text = instance["template"]
            wikitext = mwparserfromhell.parse(template_text)

            try:
                template = wikitext.get(0)
            except IndexError:
                # Parsing produced no nodes at all.
                print("no template")
                continue

            language_code = instance["lang"]
            link_target = instance["text"]
            link_target_param = instance["param"]

            if not (language_code and link_target and link_target_param):
                print("missing language code or term")
                continue

            link_target_corrected = replace_Latin_with_Cyrillic(link_target)

            if link_target == link_target_corrected:
                print("could not correct", template_text, "automatically")
                continue

            if only_Latin.match(link_target):
                print("'{}' only contains Latin, so will not be modified".format(link_target))
                continue

            if page is None:
                page = Page(site, title)
                if page.isRedirectPage():
                    page = page.getRedirectTarget()
                    # title itself is left untouched: it is the resume point
                    # returned on quit and must match a title in the input
                    # data, which the redirect target need not do.
                    print("Followed redirect from [[" + title + "]] to [[" + page.title() + "]]")
                old_text = page.text

            print("{} \N{RIGHTWARDS ARROW} {} ({})".format(link_target,
                                                           link_target_corrected,
                                                           language_code))
            template.add(link_target_param, link_target_corrected)

            page.text = page.text.replace(template_text, str(template))

            # Record each directly mapped character for the edit summary.
            corrections.extend((x, Latin_to_Cyrillic[x])
                               for x in link_target if x in Latin_to_Cyrillic)

        # corrections is only ever filled after the page has been loaded, so
        # page is not None whenever the first condition holds.
        if corrections and page.text != old_text:
            summary = _summarize_corrections(corrections)
            print("> summary:", summary)

            choice = _ask_whether_to_save()

            if choice == "q":
                print("> quitting")
                return title

            if choice == "y":
                page.save(summary=summary, minor=True, watch="watch")
            print("")
        else:
            print("> no changes\n")

    print("done!")

# Script entry: resume from the title stored in last_saved.txt (if any),
# process the pages, and store the title at which the user quit.
try:
    skip_to = None
    try:
        with open("last_saved.txt", "r", encoding="utf-8") as last_saved:
            # Drop line terminators and blank lines: a title carrying a
            # trailing newline would never match a title in the data, and
            # every page would be skipped.
            saved_titles = [line.rstrip("\r\n") for line in last_saved]
        saved_titles = [saved for saved in saved_titles if saved]
        skip_to = saved_titles[-1]
        print("skipping to [[{}]]\n".format(skip_to))
    except (OSError, IndexError, UnicodeError):
        # Missing, unreadable or empty file: start from the first page.
        print("no page to skip to")

    title = process_pages(text, skip_to)

    if title:
        # The context manager guarantees the title is flushed to disk.
        with open("last_saved.txt", "w", encoding="utf-8") as last_saved:
            last_saved.write(title)
except Exception as e:
    # Top-level boundary: report the error and end the run.
    print(e)
    print("quitting")