# User:Visviva/Python/wiktiparse.py
#
# From Wiktionary, the free dictionary
# Jump to navigation Jump to search
import urllib2
import re, time
#from nltk import metrics
#import nlp3

#parser=nlp3.Parser()

# Section titles (lowercase) recognized as part-of-speech headers.
posses=["noun","verb","adjective","adverb","determiner","article","preposition","conjunction","proper noun","letter","character","phrase","proverb","idiom","symbol","syllable"]
# Section titles (lowercase) recognized as semantic-relation headers.
relations=["synonyms","antonyms","hypernyms","hyponyms","meronyms","holonyms","troponyms","related terms","derived terms","coordinate terms"]

class Section:
	"""One section of a Wiktionary entry's wikitext.

	Recursively splits the text into subsections by header level, then
	extracts definitions ('#' lines), notes ('*' lines), categories,
	interwiki links, the inflection line and translation tables.
	"""
	def __init__(self,text,level=2,title='entry',parent=False,fancy=True):
		self.subsections={} # title -> child Section
		self.is_pos=False # True when the title names a part of speech
		self.content=''
		self.definitions=[] # Sense objects built from '#' lines
		self.notes=[] # Sense objects built from '*' lines
		self.paragraphs=[] # lines that are neither definitions nor bullets
		self.defmatchers={} # gloss text -> data to be matched to definitions
		self.inflection=''
		self.title=title.lower().replace("_"," ")
		self.parent=parent
		self.categories=[]
		if self.parent:
			self.index=self.parent.index+"_"+self.title
		else:
			self.index=self.title
		self.level=level #level of the current header; 1 for a full all-languages page
		if fancy:
			# NLP matcher used by process_defmatchers; imported lazily so
			# plain parsing (fancy=False) has no nlp3 dependency.
			import nlp3
			self.parser=nlp3.Parser()
		else:
			self.parser=False
		# Strip a pair of mis-encoded quote bytes seen in dump text.
		text=text.replace("\xa1\xb0","").replace("\xa1\xb1","")
		if self.title in posses: self.is_pos=True
		#are there any subsections in the section?
		subs=self.split(text,level=level+1)
		if len(subs) >= 2:
			self.content=subs[''].strip()
			subsections=[Section(x.strip(),level=level+1,title=s.strip(),parent=self,fancy=fancy) for s,x in subs.items() if s]
			self.subsections=dict((x.title,x) for x in subsections)
		else:
			self.content=text.strip()
		self.content=self.content.replace("\n----","")
		self.content=re.sub(r"[\n\r]+\{\{count page.*?\}\}","",self.content)
		lines=[x.strip() for x in self.content.split("\n") if x.strip()]
		if self.content: # check for categories and interwikis
			main=self.getmain(level=2)
			catlines=[x for x in lines if x.startswith("[[Category:")]
			if catlines:
				for c in catlines:
					# BUG FIX: the original generator shadowed its own loop
					# variable ("for x in x.split(...)") and never extracted
					# the category names; take the text between each
					# "[[Category:" and its closing "]]".
					main.categories.extend(p.split("]]")[0].strip() for p in c.split("[[Category:")[1:])
					if main != self:
						self.content=self.content.replace(c,"")
			interlines=[x for x in lines if re.match(r"\[\[[a-z][a-z]+\:",x)]
			for i in interlines:
				self.content=self.content.replace(i,"") #for now, just remove
			self.content=self.content.strip()

		if self.content and self.title != "translations":
			if self.is_pos:
				# In a POS section the first line is the inflection template.
				self.inflection=lines[0]
			defsplus=[x for x in lines if x.startswith("#")]
			prevdef=False
			for d in defsplus:
				defline=re.match(r'\#+([^\*\:]+.*)',d)
				if defline:
					defn=Sense(defline.group(1).strip())
					self.definitions.append(defn)
					prevdef=defn
				elif '#:' in d or '#*' in d:
					# Example/citation line: attach to the preceding definition.
					if prevdef:
						prevdef.citations.append(d)
					else:
						print("Unprocessed data: "+d)
			bullets=[x for x in lines if x.startswith("*")]
			prevnote=False
			for b in bullets:
				noteline=re.match(r'\*([^\#\:\*]+.*)',b)
				if noteline:
					noteline=noteline.group(1)
					noteline=noteline.strip()
					note=Sense(noteline)
					if noteline.startswith("{{sense"):
						# {{sense|...}}-tagged lines feed the gloss matcher.
						sense=noteline.split("{{sense")[1].split("|")[0]
						rest=noteline.split("}}",1)[1]
						# BUG FIX: was "if self.defmatchers:", which raised
						# KeyError for any new sense once one existed; test
						# membership of this particular sense instead.
						if sense in self.defmatchers:
							self.defmatchers[sense].append(rest)
						else:
							self.defmatchers[sense]=[rest]
					else:
						self.notes.append(note)
					prevnote=note
				elif '*:' in b or '**' in b:
					if prevnote:
						prevnote.citations.append(b)
					else:
						print("Unprocessed data: "+b)
			self.paragraphs=[x for x in lines if x not in bullets and x not in defsplus]
		if self.content and self.title == "translations":
			content=str(self.content)
			if "{{checktrans" in content:
				# Unchecked translations go under the '' key for review.
				if "{{checktrans-top" in content: catchme="checktrans-top"
				else: catchme="checktrans}}"
				checkit=content.split(catchme)[1].strip()
				content=content.split(catchme)[0].strip()
				self.defmatchers['']=checkit
			if "{{trans-top" in self.content:
				# Each {{trans-top|gloss}} table keys its body by gloss.
				chunks=self.content.split("{{trans-top|")[1:]
				self.defmatchers.update(dict(tuple(c.split("}}",1)) for c in chunks if "}}" in c))
			else:
				self.defmatchers['']=self.content
		if fancy: self.process_defmatchers()

	def __str__(self):
		return self.title

	def split(self,text,level=2): #splits wikitext by section at specified level
		"""Split wikitext on ==...== headers of *level*; return a dict mapping
		section title -> body, with '' keyed to the pre-header text."""
		eqs="=" * level
		splitter="[\n\r]+"+re.escape(eqs)+r"\s*([^\=]+?)\s*"+re.escape(eqs)+"[\n\r]+"
		# re.split with one capture group alternates body/title; prepending ''
		# pairs them up as (title, body) at even indices.
		sections=['']+re.split(splitter,"\n"+text)
		processed=dict([(sections[x],sections[x+1]) for x in range(len(sections)) if x%2 == 0])
		return processed

	def getmain(self,level=1):
		"""Walk up the parent chain to the ancestor at *level* (or the root)."""
		ancestor=self
		while ancestor.parent and ancestor.level != level:
			ancestor=ancestor.parent
		return ancestor

	def process_defmatchers(self):
		"""Attach subsection data (semantic relations, translations) to the
		matching Sense objects in self.definitions."""
		if not self.is_pos or not self.definitions:
			# Not a POS section: just recurse into children.
			for s in self.subsections.values():
				s.process_defmatchers()
			return True
		if len(self.definitions)==1:
			# Single definition: relation notes belong to it unambiguously.
			# BUG FIX: the original returned from inside this loop, so only
			# the first relation subsection was ever recorded.
			onlyone=self.definitions[0]
			for s in self.subsections.values():
				if s.title.lower() in relations:
					onlyone.relations[s.title.lower()] = s.notes
		for s in self.subsections.values():
			# BUG FIX: skip cleanly when there is nothing to match or no
			# parser; the original guard could fall through and crash on a
			# missing parser.
			if not s.defmatchers or not self.parser:
				continue
			print("**** %s ****" % s.title)
			# BUG FIX: was the never-assigned global "parser"; use self.parser.
			matches=self.parser.matchup3(s.defmatchers.keys(),[str(x) for x in self.definitions])
			if "__unmatched__" in matches.values():
				print("Unable to match: "+str([x for x in matches.keys() if matches[x]=="__unmatched__"]))
			for d, v in s.defmatchers.items():
				if not d: continue
				if matches[d]=="__unmatched__": continue
				match=[x for x in self.definitions if str(x) == matches[d]][0]
				match.glosses.append(d)
				if s.title != "translations":
					match.relations[s.title]=v
				else: #translations...
					text=str(v)
					for t in text.split("\n"):
						t=t.strip()
						if not t.startswith("*"): continue
						tr=re.match(r"\*[\*\:\#]*(.+)",t)
						if tr:
							# BUG FIX: was tr.groups(1).strip() -- groups()
							# returns a tuple, so this always raised
							# AttributeError; use group(1).
							tr=tr.group(1).strip()
							if ":" not in tr: print("Unable to process: "+tr)
							else:
								match.translations[tr.split(":")[0]]=tr.split(":",1)[1]

	def all_definitions(self,everything=False,pos=False):
		"""Collect (title, definition[, relations, translations, glosses])
		tuples from every POS subsection, optionally filtered by *pos*."""
		output=[]
		if not self.is_pos:
			for s in self.subsections.values(): output.extend(s.all_definitions(everything,pos))
		elif not pos or pos.lower() in self.title:
			if everything:
				output.extend((self.title,str(x),x.relations,x.translations,x.glosses) for x in self.definitions)
			else:
				output.extend((self.title,str(x)) for x in self.definitions)
		return output

	def all_subsections(self):
		"""Return a dict of all descendant sections keyed by title.
		BUG FIX: the original aliased (and therefore mutated) self.subsections
		and only descended one level; copy first and recurse fully."""
		output=dict(self.subsections)
		for s in self.subsections.values():
			output.update(s.all_subsections())
		return output
		
class Sense:
	"""A single definition (or note) line stripped of its leading markup.

	Classifies the line: a pure-template line may be a non-gloss definition,
	a {{defn}} placeholder (exists=False), or a form-of template whose
	target word is stored in form_of.
	"""
	def __init__(self,content):
		self.glosses=[] # gloss strings matched to this sense later
		self.citations=[] # example/citation lines attached to this sense
		self.translations={} # language name -> translation text
		self.relations={} # relation name -> list of notes
		self.content=content.strip()
		self.exists=True
		self.only_templates=False # True when the line is nothing but templates
		self.is_non_gloss=False
		self.form_of=False # target word for form-of templates, else False
		self.has_templates="{{" in self.content and "}}" in self.content
		# assuming for now that nested templates in sense lines will be vanishingly rare
		sans_templates=re.sub(r"\{\{[^\}]*?\}\}","",self.content)
		if not sans_templates:
			self.only_templates=True
			if "{{non-gloss definition|" in self.content:
				# BUG FIX: the replacement was "\1" -- the control character
				# \x01, not a backreference.  Use a raw string so the
				# captured definition text is substituted.
				self.content=re.sub(r"\{\{non\-gloss definition\|(.*)\}\}",r"\1",self.content)
				self.is_non_gloss=True
			elif "{{defn" in self.content:
				# {{defn}} marks a definition that has not been written yet.
				self.content=""
				self.exists=False
			else:
				# e.g. {{plural of|cat}} -> form_of = "cat"
				self.form_of=self.content.split("}}")[0].split("|")[-1]
		elif " of|" in self.content:
			self.form_of=self.content.split(" of|")[1].split("}}")[0].strip()

	def __str__(self):
		return self.content
		
class Entry:
	"""A full Wiktionary page parsed into per-language sections.

	With *language* given, self.entry is narrowed to that language's
	Section (raises IndexError if the language is absent, as before).
	With empty *text*, no attributes beyond the class itself are set.
	"""
	def __init__(self,text='',language=False,fancy=True):
		# BUG FIX (cleanup): removed the unused local "posses=[]" which
		# pointlessly shadowed the module-level constant.
		if text:
			self.entry=Section(text,level=1,fancy=fancy)
			self.languages=[x.title for x in self.entry.subsections.values()]
			if language:
				self.entry=[x for x in self.entry.subsections.values() if x.title == language][0]
	
class Diff:
	"""Compares two revisions of an entry's wikitext section-by-section,
	recording section indices that were added, removed, or modified."""
	def __init__(self,text1,text2,title=''):
		self.added=set()
		self.removed=set()
		self.changed=set()
		self.title=title.replace("_"," ")
		# fancy=False: no NLP parser is needed just to diff section text.
		self.before=Section(text1,level=1,fancy=False)
		self.after=Section(text2,level=1,fancy=False)
		self.beforesections=iterate_content(self.before,[])
		self.aftersections=iterate_content(self.after,[])
		self.beforeindices=set([x[0] for x in self.beforesections])
		self.afterindices=set([x[0] for x in self.aftersections])
		if self.beforesections == self.aftersections:
			print("No changed sections.")
		else:
			self.added=self.afterindices-self.beforeindices
			self.removed=self.beforeindices-self.afterindices
		# Sections present in both revisions, keyed by index.
		# BUG FIX (cleanup): removed a redundant second "self.changed=set()"
		# that re-initialized the attribute set above.
		stillthere=dict([(x[0],x[1]) for x in self.beforesections if x[0] in self.afterindices])
		for s in stillthere: #index
			before=stillthere[s]
			after=[x[1] for x in self.aftersections if x[0] == s][0] #content
			if before.strip() != after.strip():
				self.changed.add(s)

	def __str__(self):
		"""Human-readable summary of the change sets."""
		if not self.added and not self.removed and not self.changed:
			return 'No changes to entry "%s"' % self.title
		output='Changes to entry "%s"' % self.title
		if self.added:
			output+="\nAdded sections: "+", ".join(self.added)
		if self.removed:
			output+="\nRemoved sections: "+", ".join(self.removed)
		if self.changed:
			output+="\nModified sections: "+", ".join(self.changed)
		return output
	
def iterate_content(section,thelist=None): #create a flat list of tuples (title,content,subsections)
	"""Flatten *section* and all its descendants into a list of
	(index, whitespace-stripped content, subsections) tuples."""
	# BUG FIX: a mutable default argument ([]) accumulated results across
	# calls; create a fresh list per top-level call instead.
	if thelist is None:
		thelist=[]
	content=section.content.strip()
	# BUG FIX: the re.sub result was discarded (a no-op); assign it so the
	# comparisons in Diff ignore internal whitespace as evidently intended.
	content=re.sub(r"[\s\r\n]+","",content)
	thelist.append((section.index,content,section.subsections.values()))
	for s in section.subsections.values():
		iterate_content(s,thelist)
	return thelist
	
def unescape(text):
    """Replace HTML character references (&amp;, &#65;, &#x41;) in *text*
    with the characters they name; malformed or unknown references are
    left unchanged.

    From code by Fredrik Lundh at http://effbot.org/zone/re-sub.htm#-html
    Licensed to the public domain at http://effbot.org/zone/copyright.htm
    Seems to work better than BeautifulSoup for this purpose.
    """
    # BUG FIX: name2codepoint was never imported, so every named entity
    # raised NameError.  Import it lazily, compatibly with Python 2 and 3.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
        _chr = unichr
    except ImportError:
        from html.entities import name2codepoint   # Python 3
        _chr = chr
    def fixup(m):
        ref = m.group(0)
        if ref.startswith("&#"):
            try:
                if ref.startswith("&#x"):
                    return _chr(int(ref[3:-1], 16))
                else:
                    return _chr(int(ref[2:-1]))
            except ValueError:
                pass  # malformed numeric reference: leave as-is
        else:
            try:
                return _chr(name2codepoint[ref[1:-1]])
            except KeyError:
                pass  # unknown entity name: leave as-is
        return ref
    return re.sub(r"\&\#?\w+\;", fixup, text)

	
def cycle(cycletime=60,pause=5,depth=10):
	"""Poll the English Wiktionary recent-changes API forever, printing a
	section-level Diff summary for each mainspace edit.

	cycletime -- seconds to sleep between polling rounds
	pause     -- seconds to sleep between individual revision requests
	depth     -- number of recent changes to request per round (rclimit)
	"""
	last_timestamp=False
	while True:
		url="http://en.wiktionary.org/w/api.php?action=query&list=recentchanges&format=xml&rclimit="+str(depth)
		if last_timestamp: 
			# Strip ISO-8601 punctuation and add one second so the next query
			# resumes just after the newest change already processed.
			last_timestamp=str(int(last_timestamp.replace("T","").replace("-","").replace(":","").replace("Z","").split(".")[0])+1)
			url+="&rcend="+last_timestamp
#		print url
		rcpage=unicode(urllib2.urlopen(url).read(),"utf-8","ignore")
		if not rcpage: 
			print "Unable to open page"
			time.sleep(cycletime)
			continue
		# One <rc .../> element per change; reverse so changes are handled
		# oldest-first.
		changes=re.findall("\<rc (.*?)\/\>",rcpage)[::-1]
		print len(changes)
		if not changes: 
			print "No updates..." 
			time.sleep(cycletime)
			continue
		last_timestamp=changes[-1].split('timestamp="')[1].split('"')[0]
		for c in changes:
			# Attribute values are scraped straight out of the XML text.
			after_id=c.split('revid="')[1].split('"')[0]
			before_id=c.split('old_revid="')[1].split('"')[0]
			title=c.split('title="')[1].split('"')[0]
			# Skip non-mainspace pages (titles containing a namespace colon).
			if ":" in title: continue
			title=title.replace(" ","_")
			if "&#" in title: title=unescape(title)
			after_url=unicode("http://en.wiktionary.org/w/api.php?action=query&prop=revisions&titles=%s&rvlimit=1&rvstartid=%s&rvprop=comment|content|user&format=xml").encode("utf-8","ignore") % (title,after_id)
			after_url=after_url.encode("utf-8","ignore")
			time.sleep(pause)
			after=False
			# NOTE(review): bare except retries forever on a permanent error.
			while not after:
				try: 
					after=unicode(urllib2.urlopen(after_url).read(),"utf-8","ignore").encode("utf-8","ignore")
				except:
					print "Error in loading %s" % after_url
					time.sleep(pause)
			if before_id and before_id != "0":
				before_url=unicode("http://en.wiktionary.org/w/api.php?action=query&prop=revisions&titles=%s&rvlimit=1&rvstartid=%s&rvprop=comment|content|user&format=xml").encode("utf-8","ignore") % (title,before_id)
				before_url=before_url.encode("utf-8","ignore")
				time.sleep(pause)
				before=False
				while not before:
					try: 
						before=unicode(urllib2.urlopen(before_url).read(),"utf-8","ignore").encode("utf-8","ignore")
					except:
						print "Error in loading %s" % before_url
						time.sleep(pause)
			else:
				# old_revid of 0 means the page was just created.
				before_text=''
				before=''
			if 'missing=""' in before or 'missing=""' in after: #entry has been deleted
				print "Entry %s deleted." % title.encode("utf-8","ignore")
				continue
			# Revision wikitext sits between xml:space="preserve"> and </rev>.
			try: after_text=after.split('xml:space="preserve">')[1].split("</rev>")[0].strip()
			except IndexError:
				print after
				print after_url
				continue
			if before:
				try: 
					before_text=before.split('xml:space="preserve">')[1].split("</rev>")[0].strip()
				except:
					print before
					print before_url
					continue
			try: 
				summary=after.split('comment="')[1].split('"')[0]
			except IndexError:
				summary=''
			user=unicode(after.split('user="')[1].split('"')[0],"utf-8","ignore")
			diff=Diff(before_text,after_text,title.encode("utf-8","ignore"))
			timestamp=c.split('timestamp="')[1].split('"')[0].encode("utf-8","ignore")
			print "\n\n%s edited %s at %s" % (user.encode("utf-8","ignore"),title.encode("utf-8","ignore"),timestamp)
			print "Summary: %s" % summary
			if not before: print "(New page)"
			print diff #outputs summary when coerced to string
		print "Waiting..."
		time.sleep(cycletime)

def present(section):  #human-readable depiction of entry as parsed
	"""Print a human-readable dump of *section* and recurse into its
	subsections.  Returns None; output goes to stdout."""
	print("***** %s *****" % section.title)
	if section.is_pos: print("POS Section")
	print("Content:\n"+section.content[:100])
	if section.definitions:
		print("Definitions:\n"+"\n".join(str(x) for x in section.definitions))
	if section.notes:
		print("Notes:\n"+"\n".join(str(x) for x in section.notes))
	if section.defmatchers:
		print("Defmatchers:\n"+"\n".join(str((x,v)) for x,v in section.defmatchers.items()))
	if section.subsections:
		print("Subsections: "+", ".join(section.subsections.keys()))
		# BUG FIX: iterating the dict directly yields title strings, so the
		# recursive call crashed with AttributeError; recurse into the
		# Section objects via .values().  Also fixed a mixed space/tab
		# indent on the notes check above.
		for s in section.subsections.values():
			present(s)