User:ArielGlenn/scripts/mgxlit.pl

From Wiktionary, the free dictionary
Jump to navigation Jump to search

I use this to transliterate from Greek to the transliteration scheme on the Wiktionary:About_Greek page. Sure hope I got all the rules right. It's not meant to be good Perl, just maintainable.

#!/usr/bin/perl

binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

use utf8;
#use encoding "utf8";

# transcription:

#  β γ γγ γχ  δ ζ θ  κ λ μ μπ   ν ντ ξ  π ρ σ τ φ χ  ψ
#  v g ng nch d z th k l m mb|b n nt ks p r s t f ch ps

#  αά  εέ  ηή  ιίϊΐ  οό  υύϋΰ  ωώ
#  aá  eé  ií  iíïí  oó  yýÿý  oó
##  aá  eé  ií  iíïí  oó  yýyý  oó

##  aá  eé  ií  iíií  oó  yýyý  oó

# αυ     αύ     ευ     εύ     ηυ     ηύ     ου   ού 
# af|av  áf|áv  ef|ev  éf|év  if|iv  íf|ív  ou   oú 

## αυ     αύ     ευ     εύ     ηυ     ηύ     ου   ού οϋ
## af|av  áf|áv  ef|ev  éf|év  if|iv  íf|ív  ou   oú oy

# rules:


sub get_next_cluster {
        my ($index,@letters) = @_;

        if ($index == $#letters) {
	    return(0);
        }
	SWITCH:  {
          # consonants
	    ($letters[$index] =~ /β/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /γ/) && do {
		if ($letters[$index+1] =~ /[κγξχ]/) {
                  return(2,$letters[$index],$letters[$index+1]);            
                }
                else {
 	          return(1,$letters[$index]);
                }
            };
	    ($letters[$index] =~ /δ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /ζ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /θ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /λ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /μ/) && do {
                if ($letters[$index+1] =~ /π/) {
                  return(2,$letters[$index],$letters[$index+1]);            
 	        }
                else {
 	          return(1,$letters[$index]);
                }
            };
	    ($letters[$index] =~ /ν/) && do {
 	        return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ξ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /π/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /ρ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /σ/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ς/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /τ/) && do {
 	        return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /φ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /χ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /ψ/) && do {
	        return(1,$letters[$index]);
	    };
          # vowels
	    ($letters[$index] =~ /α/) && do {
		if ($letters[$index+1] =~ /[υύ]/) {
                  return(2,$letters[$index],$letters[$index+1]);            
 	        }
                else {
 	          return(1,$letters[$index]);
                }
            };
	    ($letters[$index] =~ /ά/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ε/) && do {
		if ($letters[$index+1] =~ /[υύ]/) {
                  return(2,$letters[$index],$letters[$index+1]);            
 	        }
                else {
 	          return(1,$letters[$index]);
                }
            };
	    ($letters[$index] =~ /έ/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /η/) && do {
		if ($letters[$index+1] =~ /[υύ]/) {
                  return(2,$letters[$index],$letters[$index+1]);            
 	        }
                else {
 	          return(1,$letters[$index]);
                }
            };
	    ($letters[$index] =~ /ή/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ι/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ί/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ϊ/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ΐ/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ο/) && do {
		if ($letters[$index+1] =~ /[υύϋ]/) {
                  return(2,$letters[$index],$letters[$index+1]);            
 	        }
                else {
 	          return(1,$letters[$index]);
                }
            };
	    ($letters[$index] =~ /ό/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /υ/) && do {
	        return(1,$letters[$index]);
	    };
	    ($letters[$index] =~ /ύ/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ϋ/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ΰ/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ω/) && do {
                return(1,$letters[$index]);
            };
	    ($letters[$index] =~ /ώ/) && do {
                return(1,$letters[$index]);
            };
          # default
            do {
                return(1,$letters[$index]);
            };
      }
}

sub cluster_to_sym {
    my($count,@cluster) = @_;
    if ($count == 0) {
	return("");
    }    
    if ($count == 1) {
	if ($cluster[0] =~ /β/) { return("v"); }
	if ($cluster[0] =~ /γ/) { return("g"); }
	if ($cluster[0] =~ /δ/) { return("d"); }
	if ($cluster[0] =~ /ζ/) { return("z"); }
	if ($cluster[0] =~ /θ/) { return("th"); }
	if ($cluster[0] =~ /κ/) { return("k"); }
	if ($cluster[0] =~ /λ/) { return("l"); }
	if ($cluster[0] =~ /μ/) { return("m"); }
	if ($cluster[0] =~ /ν/) { return("n"); }
	if ($cluster[0] =~ /ξ/) { return("x"); }
	if ($cluster[0] =~ /π/) { return("p"); }
	if ($cluster[0] =~ /ρ/) { return("r"); }
	if ($cluster[0] =~ /σ/) { return("s"); }
	if ($cluster[0] =~ /ς/) { return("s"); }
	if ($cluster[0] =~ /τ/) { return("t"); }
	if ($cluster[0] =~ /φ/) { return("f"); }
	if ($cluster[0] =~ /χ/) { return("ch"); }
	if ($cluster[0] =~ /ψ/) { return("ps"); }
	if ($cluster[0] =~ /α/) { return("a"); }
	if ($cluster[0] =~ /ά/) { return("á"); }
	if ($cluster[0] =~ /ε/) { return("e"); }
	if ($cluster[0] =~ /έ/) { return("é"); }
	if ($cluster[0] =~ /η/) { return("i"); }
	if ($cluster[0] =~ /ή/) { return("í"); }
	if ($cluster[0] =~ /ι/) { return("i"); }
	if ($cluster[0] =~ /ί/) { return("í"); }
	if ($cluster[0] =~ /ϊ/) { return("ï"); }
	if ($cluster[0] =~ /ΐ/) { return("í"); }
#	if ($cluster[0] =~ /ΐ/) { return("ḯ"); }
	if ($cluster[0] =~ /ο/) { return("o"); }
	if ($cluster[0] =~ /ό/) { return("ó"); }
	if ($cluster[0] =~ /υ/) { return("y"); }
	if ($cluster[0] =~ /ύ/) { return("ý"); }
	if ($cluster[0] =~ /ϋ/) { return("ÿ"); }
	if ($cluster[0] =~ /ΰ/) { return("ý"); }
#	if ($cluster[0] =~ /ΰ/) { return("ý"); }
# this is a problem.  I really want something that doesn't exist
# in the unicode tables, i.e. y with accent and diaeresis.
	if ($cluster[0] =~ /ω/) { return("o"); }
	if ($cluster[0] =~ /ώ/) { return("ó"); }
   }
   if ($count == 2) {
# we change these later if needed.
	if (($cluster[0] =~ /γ/) && ($cluster[1] =~ /γ/)) { return("ng"); }
	if (($cluster[0] =~ /γ/) && ($cluster[1] =~ /κ/)) { return("gk"); }
	if (($cluster[0] =~ /γ/) && ($cluster[1] =~ /ξ/)) { return("gx"); }
	if (($cluster[0] =~ /γ/) && ($cluster[1] =~ /χ/)) { return("nch"); }
	if (($cluster[0] =~ /μ/) && ($cluster[1] =~ /π/)) { return("b"); }

	if (($cluster[0] =~ /α/) && ($cluster[1] =~ /υ/)) { return("av"); }
	if (($cluster[0] =~ /α/) && ($cluster[1] =~ /ύ/)) { return("áv"); }
	if (($cluster[0] =~ /ε/) && ($cluster[1] =~ /υ/)) { return("ev"); }
	if (($cluster[0] =~ /ε/) && ($cluster[1] =~ /ύ/)) { return("év"); }
	if (($cluster[0] =~ /η/) && ($cluster[1] =~ /υ/)) { return("iv"); }
	if (($cluster[0] =~ /η/) && ($cluster[1] =~ /ύ/)) { return("ív"); }
	if (($cluster[0] =~ /ο/) && ($cluster[1] =~ /υ/)) { return("ou"); }
	if (($cluster[0] =~ /ο/) && ($cluster[1] =~ /ύ/)) { return("oú"); }
#	if (($cluster[0] =~ /ο/) && ($cluster[1] =~ /ϋ/)) { return("oy"); }
	if (($cluster[0] =~ /υ/)  && ($cluster[1] =~ /ι/)) { return("yi"); }
	if (($cluster[0] =~ /υ/)  && ($cluster[1] =~ /ί/)) { return("yí"); }
   }
#   return(" ");
   return($cluster[0]);
}

sub modify_from_following {
    my($sym,$index,@letters) = @_;

    if ($index > $#letters) {
        return($sym);
    }
    ($peek_count,@peek_cluster) = get_next_cluster($index,@letters);
    my $sym_next = "";
    if ($peek_count) {
      $sym_next = $peek_cluster[0];
    }

    if ($sym =~ /av/) {
        if (($sym_next =~ /^[θκξπστφχψ\W]/) || ($sym_next =~ /^$/)) { return("af"); }
        else { return($sym); }
    }

    if ($sym =~ /áv/) {
        if (($sym_next =~ /^[θκξπστφχψ\W]/) || ($sym_next =~ /^$/)) { return("áf"); }
        else { return($sym); }
    }

    if ($sym =~ /ev/) {
        if (($sym_next =~ /^[θκξπστφχψ\W]/) || ($sym_next =~ /^$/)) { return("ef"); }
        else { return($sym); }
    }

    if ($sym =~ /év/) {
        if (($sym_next =~ /^[θκξπστφχψ\W]/) || ($sym_next =~ /^$/)) { return("éf"); }
        else { return($sym); }
    }

    if ($sym =~ /iv/) {
        if (($sym_next =~ /^[θκξπστφχψ\W]/) || ($sym_next =~ /^$/)) { return("if"); }
        else { return($sym); }
    }

    if ($sym =~ /ív/) {
        if (($sym_next =~ /^[θκξπστφχψ\W]/) || ($sym_next =~ /^$/)) { return("íf"); }
        else { return($sym); }
    }

    return($sym);
}

sub is_following {
    my ($index,@letters) = @_;

    if ($index >= $#letters) {
        return(0);
    }
    ($peek_count,@peek_cluster) = get_next_cluster($index,@letters);
    return($peek_count);
}

sub b_to_mp {
    my($sym,$sym_preceding,$index,@letters) = @_;

    if ($sym =~ /^$/) { return($sym); }
    if ($sym =~ /b/) {
	if ($sym_preceding =~ /^.+$/) {
	    if (is_following($index,@letters)) {
		return("mp");
	    }
	}
    }
    return($sym);
}

while (<STDIN>) {
    $line = lc($_);
    $newline = "";
    @letters = split(//,$line);
    $precedes = "";
    $index = 0;
    $sym_preceding = "";
    $sym = "";
    $output = "";
    while (($count, @cluster) = get_next_cluster($index,@letters)) {
        if ($count == 0) {
	    last;
        }
      # convert to transliterated value, first approx.
        $sym = cluster_to_sym($count,@cluster);
      # check for following letters
        $sym = modify_from_following($sym,$index+$count,@letters);

      # special case: b or mp
        $sym = b_to_mp($sym,$sym_preceding,$index,@letters);

#        for ($i=0; $i<$count; $i++ ) {
#	    print "$cluster[$i]";
#        }
#        print "($sym)";
#        print " ";

	$output .= $sym;
        $index += $count; 
        $sym_preceding = $sym;
	$sym = "";
    }
    print "$output\n";
}

exit 0;