User:IcqBOT/code

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This code is released under the GPL v3 or later.

When it is given a file name on the command line, the script first reads that text file to create a huge dictionary with 11000 entries and a lot of structure underneath, which then gets pickled; when run without an argument it loads the previously pickled dictionary instead. The second part then starts consuming this dictionary, converting it to en.wiktionary.org format. This is the code that IcqBOT is actually running. The data dictionary is emptied as the data is entered into Wiktionary.

--Polyglot 21:49, 6 February 2009 (UTC)

#!/usr/bin/python
# -*- coding: utf-8  -*-

import sys, codecs, re, pickle, pw, os, mwclient
# Connect to the English Wiktionary and log in as the bot account.  The
# second argument to login() comes from the imported ``pw`` module, so the
# credential itself is not stored in this file.
site = mwclient.Site('en.wiktionary.org')
site.login(u'IcqBOT',pw.pw)

# Name of the pickle file holding the ``headwords`` dictionary; it is written
# after parsing and re-read when the script is started without arguments.
fn="tlpickle"

if len(sys.argv) > 1:
    filename=sys.argv[1].decode('utf-8')

    fileObj = codecs.open( filename, "r", "utf-8" )
    headwords={}

    regex=ur"""(?xu)
    (?:
      (?P<type>\*|::|\+\+)
      \s*
      (?:
      (?P<rndbropn>\()
      (?P<headword>[^\)]+)
      (?P<rndbrcls>\))
      )*
      \s*
      (?P<def>[^\{]+)
      \s*
      (?:\{.*?\})*
    )
    |
    (?:
      (?P<TorE>T|E)
      (?P<seq>\d)
      \)
      \s*
      (?P<example>.*)
    )
    |
    (?:
      (?:-)*
      (?P<prefix>Pas|Akt|[A-Z]+)*
      :
      \s*
      (?P<verbforms>.*)
    )
    $"""

    regexRE=re.compile(regex)
    headwords={}

    prefixindex=1 ; prefixlist = []
    for line in fileObj:
        if line:
            print line
            m=regexRE.search(line)
            if m:
                REdict=m.groupdict()
                #print REdict
                if REdict.get('type'):
                    if REdict['type'] == '*':
                        hw=REdict['def'].strip()
                        headwords[hw]={}
                    elif REdict['type'] =='::':
                        if REdict.get('headword'):
                            if 'related' not in headwords[hw]:
                                headwords[hw]['related']=[]
                            headwords[hw]['related'].append(REdict['headword'].strip())
                            if REdict['headword'] not in headwords:
                                headwords[REdict['headword']]={}
                            headwords[REdict['headword']]['noun']=REdict['def'].strip()
                        else:
                            headwords[hw]['noun']=REdict['def'].strip()
                    elif REdict['type'] =='++':
                        if REdict.get('headword'):
                            if 'related' not in headwords[hw]:
                                headwords[hw]['related']=[]
                            headwords[hw]['related'].append(REdict['headword'].strip())
                            if REdict['headword'] in headwords:
                                headwords[REdict['headword']]={}
                            headwords[REdict['headword']]['adj']=REdict['def'].strip()
                        else:
                            headwords[hw]['adj']=REdict['def'].strip()
                elif REdict.get('verbforms'):
                    if REdict.get('prefix'):
                        prefixlist.append(REdict['prefix'])
                        if 'verbforms' not in headwords[hw]:
                            headwords[hw]['verbforms']={}
                        headwords[hw]['verbforms'][REdict['prefix']]=REdict['verbforms'].strip()
                    else:
                        if 'translations' not in headwords[hw]['verbforms']:
                            headwords[hw]['verbforms']['translations']= {}
                        headwords[hw]['verbforms']['translations'][prefixindex] = REdict['verbforms'].strip()
                        if 'prefixes' not in headwords[hw]:
                            headwords[hw]['prefixes']={}
                        headwords[hw]['prefixes'][prefixindex]= prefixlist
                        prefixindex += 1 ; prefixlist = []
                elif REdict.get('TorE'):
                    if 'examples' not in headwords[hw]:
                        headwords[hw]['examples']={}
                    if REdict['seq'] not in headwords[hw]['examples']:
                        headwords[hw]['examples'][REdict['seq']]={}
                    headwords[hw]['examples'][REdict['seq']][REdict['TorE']]= REdict['example'].strip()

                print '***********************************************'
            else:
                #hw='' # ; headwords = {}
                if prefixlist:
                    if 'prefixes' not in headwords[hw]:
                        headwords[hw]['prefixes']={}
                    headwords[hw]['prefixes'][prefixindex]= prefixlist
                    prefixlist=[]
                prefixindex=1
                print '+++++++++++++++++++++++++++++++++++++++++++++++'
                # raw_input()
    #print headwords

    fileObj.close()

    file = open(fn, 'w')
    pickle.dump((headwords), file)
    file.close()
else:
    file = open(fn,'r')
    headwords = pickle.load(file)
    file.close()
#print headwords
# Wikitext accumulated for the entry currently being built.
output=u''

# Part-of-speech key -> [section heading wikitext, template text that gets
# wrapped in {{ }} as the headword line].  The keys 'noun', 'adj' and
# 'verbforms' match the keys stored per headword; 'propernoun' is substituted
# for 'noun' when the stored definition starts with an upper-case letter.
posdict={
    'noun': ['\n===Noun===\n','tl-noun'],
    'propernoun': ['\n===Proper noun===\n','infl|tl|proper noun'],
    'adj':   ['\n===Adjective===\n','tl-adj'],
    'verbforms': ['\n===Verb===\n', 'infl|tl|verb'],
}
count=1
for hw in sorted(headwords):
    #if count>500: break
    #output +='---[http://en.wiktionary.org/w/index.php?title='  + hw + '&action=edit ' + hw + ']---\n'
    count+=1 ; print count, hw

    #print "headwords['"+hw+"']:",
    #print headwords[hw]
    output += '==Tagalog==\n'
    for pos in posdict:
        if pos in headwords[hw]:
            pos2=pos
            if pos == 'noun' and headwords[hw][pos][:1].isupper(): pos2='propernoun'
            output += posdict[pos2][0]
            output += "{{" + posdict[pos2][1] + "}}\n"
            output += "\n"
            if pos != 'verbforms':
                output += '# '
                synonyms=[]
                words = headwords[hw][pos].split('>')[0].strip()
                if '>' in headwords[hw][pos]:
                    for syn in headwords[hw][pos].split('>')[1].split(','):
                        synonyms.append(syn.strip())
                for w in words.split(','):
                    output += "[[" + w.strip() + "]], "
                if output.endswith(', '): output = output[:-2]
                output += "\n"
            else:
                prev='' ; line = ''
                #print headwords[hw]['verbforms']
                if 'translations' in headwords[hw]['verbforms']:
                    for ix in headwords[hw]['verbforms']['translations']:
                        for w in headwords[hw]['verbforms']['translations'][ix].split():
                            cur=w.strip()
                            if prev == 'to':
                                line+='[[' + cur + ']] '
                                line=re.sub(ur'''(,\]\])''',']],', line)
                            else:
                                line+= cur + ' '
                            prev=cur
                        output += '# ' + line[:-1] + '\n' ; line = ''

    if 'examples' in headwords[hw]:
        for i in sorted(headwords[hw]['examples']):
            if 'T' in headwords[hw]['examples'][i]:
                output+='#:'+headwords[hw]['examples'][i]['T']+'\n'
            if 'E' in headwords[hw]['examples'][i]:
                output+='#::'+headwords[hw]['examples'][i]['E']+'\n'
        output += '\n'
    if 'verbforms' in headwords[hw] and pos=='verbforms':
        output +='\n====Conjugation====\n'
        output += "{{tl-infl"
        i=1
        for ix in headwords[hw]['prefixes']:
            for verbform in headwords[hw]['prefixes'][ix]:
                if verbform != 'translations':
                    output += '|row' + str(i) + '=' + verbform
                    tenses=['inf','pas','pre','fut']
                    index=0
                    #print headwords[hw]['verbforms'][verbform]
                    for j in headwords[hw]['verbforms'][verbform].split(','):
                        #print j, index, tenses[index]
                        o=j.strip()
                        if o: output  += '|' + tenses[index] + str(i) + '='+ o
                        index += 1
                i+=1
        output += "}}\n"

    if synonyms:
        output += '\n====Synonyms====\n* '
        for s in synonyms:
            output += "[[" + s.strip() + "]], "
        output = output[:-2] + "\n"
    if 'related' in headwords[hw]:
        output += '\n===Related terms===\n* '
        for rel in headwords[hw]['related']:
            output += '[[' + rel + ']], '
        output = output[:-2] + '\n'

    workpage=site.Pages[hw]
    pagetext=workpage.edit()
    fileObj = codecs.open("tl-origtempfile", "w", "utf-8" )
    fileObj.write(pagetext)
    fileObj.close()
    if pagetext:
        pagetext += '\n' + output
    else:
        pagetext = output
    fileObj = codecs.open("tl-tempfile", "w", "utf-8" )
    fileObj.write(pagetext)
    fileObj.close()
    os.system('meld tl-origtempfile tl-tempfile')
    fileObj = codecs.open("tl-tempfile", "r", "utf-8" )
    pagetext = fileObj.read()
    fileObj.close()
    print pagetext
    print 's or w: write to Wiktionary\n     d: remove from to-do list (not processing entry)\n     q: quit (after updating to-do list, not processing last entry)'
    inp=''
    while inp not in ['w','s','d','q']:
        inp=raw_input()
    if inp in ('w', 's') and pagetext.strip():
        workpage.save(text=pagetext,
                                 summary = u'Semi-automated creation of Tagalog entries')
    elif inp=='q':
        file = open(fn, 'w')
        pickle.dump((headwords), file)
        file.close()
        exit()
    del headwords[hw]
    output=''