User:Flubot/tools

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Find page sections[edit]

This simple tool reads an XML file obtained through Special:Export and prints every entry in it that has more than one language header. It is useful, for example, for checking whether a certain Romanian entry spelled with ş or ţ can be moved to a new name spelled with ș or ț.

cat roverbs.xml | ./lang_headers.py
 

lang_headers.py[edit]

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, re

# Patterns for the line-oriented scan of a Special:Export dump.  They are
# compiled once here because every input line is tested against them.
PAGE_START = re.compile(r'<page>')
PAGE_END = re.compile(r'</page>')
TITLE_LINE = re.compile(r'<title>(.*)</title>')
# A title of the form "Prefix:rest" is treated as living outside the main
# namespace.
NAMESPACED_TITLE = re.compile(r'<title>([^:]+):(.+)</title>')
# A level-2 heading ("==Name=="), either at the start of a line or directly
# after a closing ">" -- the first line of the page text shares its line with
# the opening <text ...> tag.  "[^=]+" accepts any language name (hyphens,
# diacritics, digits) while still rejecting "===Noun===" and deeper headings.
LANG_HEADER = re.compile(r'(?:^|>)==([^=]+)==\s*$')

# Gap written between the title and each language name on an output line.
SEPARATOR = "   "


def find_multilang_pages(lines):
    """Yield ``(title, languages)`` for main-namespace pages with 2+ languages.

    ``lines`` is any iterable of text lines from a Special:Export XML dump
    (an open file, ``sys.stdin``, a list of strings).  ``languages`` is the
    list of level-2 heading names in the order they appear on the page.
    Pages whose title carries a ``Prefix:`` are skipped, as are pages with
    fewer than two language headings.
    """
    # Initialise the per-page state up front so that a heading or a stray
    # </page> seen before the first <page> cannot hit an undefined name.
    title = ""
    in_main_namespace = False
    languages = []

    for line in lines:
        if PAGE_START.search(line):
            title = ""
            in_main_namespace = False
            languages = []
        elif TITLE_LINE.search(line):
            if NAMESPACED_TITLE.search(line):
                # The prefix itself is irrelevant; only main-namespace
                # pages are ever reported.
                in_main_namespace = False
            else:
                in_main_namespace = True
                title = TITLE_LINE.search(line).group(1)
        elif PAGE_END.search(line):
            if in_main_namespace and len(languages) > 1:
                yield title, languages
            # Clear the state so text between pages is never attributed to
            # the page that just ended.
            title = ""
            in_main_namespace = False
            languages = []
        else:
            header = LANG_HEADER.search(line)
            if header:
                languages.append(header.group(1).strip())


def main(stream=None, out=None):
    """Read a dump from ``stream`` and write one line per multi-language page.

    Each output line holds the page title followed by its language names,
    separated by three spaces.  ``stream`` defaults to ``sys.stdin`` and
    ``out`` to ``sys.stdout``, so the script still works as a shell filter:
    ``cat dump.xml | ./lang_headers.py``.
    """
    if stream is None:
        stream = sys.stdin
    if out is None:
        out = sys.stdout
    for title, languages in find_multilang_pages(stream):
        # write() rather than print keeps the script runnable on both
        # Python 2 and Python 3.
        out.write(SEPARATOR.join([title] + languages) + "\n")


if __name__ == "__main__":
    main()