User:Polyglot/PHPscript1

From Wiktionary, the free dictionary
Jump to navigation Jump to search

<?php
/*
 * This script reads words from text files.
 * It builds a wikified string variable with all those translations.
 * Then it queries the wiktionary server with the English translation to see
 * if an entry already exists. If so, the existing and the new version are
 * merged together. If not, only the new version is retained.
 * Then a new query is performed, telling the server we want to see a preview
 * of our newly proposed string.
 * Then it's up to the user to check and fix the entry and to submit it.
 * It's twice as slow, since it needs to access the server two times, but it
 * saves an enormous amount of typing.
 */

// Tell the browser we are working with Unicode.
header('Content-type: text/html; charset=utf-8', false);

/**
 * Sends the given fields as an application/x-www-form-urlencoded POST request
 * over a plain socket on port 80 and returns the raw response.
 *
 * @param array  $datastream Form fields as name => value pairs.
 * @param string $url        Target URL; a leading "http://" is stripped.
 *
 * @return array On success, the response (status line, headers and body) as
 *               the list of lines read with fgets(). If the connection cannot
 *               be opened, an array with the keys "errno" and "errstr" as
 *               reported by fsockopen().
 */
function post_it($datastream, $url) {
    // Split "host/path?query" into the host and the request URI.
    $url = preg_replace("@^http://@i", "", $url);
    $slashpos = strpos($url, "/");
    if ($slashpos === false) {
        // No path at all: the whole string is the host, request the root.
        $host = $url;
        $uri = "/";
    } else {
        $host = substr($url, 0, $slashpos);
        $uri = substr($url, $slashpos);
    }

    // Encode names as well as values so that neither can break the body.
    $pairs = array();
    foreach ($datastream as $key => $val) {
        $pairs[] = urlencode((string) $key)."=".urlencode((string) $val);
    }
    $reqbody = implode("&", $pairs);
    $contentlength = strlen($reqbody);

    // Every header line ends in CRLF. "Connection: close" makes the server
    // close the socket after the response, which is what ends the read loop
    // below; without it the loop waits for the keep-alive timeout.
    $reqheader = "POST $uri HTTP/1.1\r\n".
                 "Host: $host\r\n".
                 "User-Agent: PostIt\r\n".
                 "Content-Type: application/x-www-form-urlencoded; charset=utf-8\r\n".
                 "Content-Length: $contentlength\r\n".
                 "Connection: close\r\n\r\n";

    $socket = fsockopen($host, 80, $errno, $errstr);
    if (!$socket) {
        return array("errno" => $errno, "errstr" => $errstr);
    }

    // The body is sent exactly as counted in Content-Length (no trailing CRLF).
    fwrite($socket, $reqheader.$reqbody);

    $result = array();
    while (!feof($socket)) {
        $line = fgets($socket, 4096);
        if ($line === false) {
            // Read error or end of stream: do not store false as a line.
            break;
        }
        $result[] = $line;
    }
    fclose($socket);

    return $result;
}

// The function that follows (utf2html) is not actually used.
/**
 * Replaces every multi-byte UTF-8 sequence in a string by a decimal HTML
 * numeric character reference ("&#NNN;", zero-padded to at least 3 digits).
 * Single-byte (ASCII) characters are copied unchanged, and so are bytes that
 * do not start a complete, well-formed 2-, 3- or 4-byte sequence.
 *
 * Nothing in this script calls this function.
 *
 * @param string $utf2html_string UTF-8 encoded input.
 * @return string The input with multi-byte characters turned into references.
 */
function utf2html ($utf2html_string) {
    $utf2html_retstr = "";
    $utf2html_len = strlen($utf2html_string);
    for ($utf2html_p = 0; $utf2html_p < $utf2html_len; $utf2html_p++) {
        $utf2html_c = $utf2html_string[$utf2html_p];
        $utf2html_c1 = ord($utf2html_c);

        // The lead byte tells how many continuation bytes follow and
        // carries the highest bits of the code point.
        if ($utf2html_c1 >> 5 == 6) {            // 110x xxxx: 2-byte sequence
            $utf2html_extra = 1;
            $utf2html_cp = $utf2html_c1 & 31;
        } elseif ($utf2html_c1 >> 4 == 14) {     // 1110 xxxx: 3-byte sequence
            $utf2html_extra = 2;
            $utf2html_cp = $utf2html_c1 & 15;
        } elseif ($utf2html_c1 >> 3 == 30) {     // 1111 0xxx: 4-byte sequence
            $utf2html_extra = 3;
            $utf2html_cp = $utf2html_c1 & 7;
        } else {
            $utf2html_retstr .= $utf2html_c;
            continue;
        }

        // Each continuation byte (10xx xxxx) contributes 6 more bits.
        $utf2html_ok = ($utf2html_p + $utf2html_extra < $utf2html_len);
        for ($utf2html_k = 1; $utf2html_ok && $utf2html_k <= $utf2html_extra; $utf2html_k++) {
            $utf2html_c2 = ord($utf2html_string[$utf2html_p + $utf2html_k]);
            if (($utf2html_c2 & 192) != 128) {
                $utf2html_ok = false;
            } else {
                $utf2html_cp = ($utf2html_cp << 6) | ($utf2html_c2 & 63);
            }
        }

        if (!$utf2html_ok) {
            // Truncated or malformed sequence: keep the byte as it is.
            $utf2html_retstr .= $utf2html_c;
            continue;
        }

        $utf2html_retstr .= sprintf("&#%03d;", $utf2html_cp);
        $utf2html_p += $utf2html_extra;
    }
    return $utf2html_retstr;
}

/**
 * Turns an English spelling into a rough IPA string by plain text
 * substitution.
 *
 * An "@" is put in front of every character, then three rule sets are
 * applied one after the other with str_replace(): the trigraph "chr", a list
 * of digraphs, and finally the single letters a-z. Rules inside a set are
 * applied in the order listed. The last rule drops every "@" that is left.
 * Only lower-case ASCII letters have rules; anything else is passed through.
 *
 * NOTE(review): many replacements string several IPA symbols together
 * (e.g. "a" becomes "eɪæəɑ"); presumably these are alternatives for the
 * editor to prune by hand - confirm.
 *
 * @param string $spelling Word to convert.
 * @return string The converted string.
 */
function ConvertToIPA($spelling) {
    // preg_split('//') also yields an empty first and last element, so the
    // result both starts and ends with "@", e.g. "cat" => "@c@a@t@".
    $IPA = implode('@', preg_split('//', $spelling));

    $IPA = str_replace("@c@h@r", "kɽ", $IPA);

    $digraphs = array(
        "@e@a" => "iɛæ",
        "@s@h" => "ʃ",
        "@c@h" => "ʧk",
        "@c@a" => "keɪæəɑ",
        "@c@e" => "siəɛ",
        "@c@i" => "saɪə",
        "@c@o" => "kɔo",
        "@c@u" => "kʌʊju",
        "@c@y" => "sɪiə",
        "@q@u" => "kw",
        "@l@l" => "l",
        "@d@z" => "ʤ",
        "@r@r" => "ɽ",
        "@p@h" => "f",
        "@o@o" => "ʊ",
        "@c@k" => "k",
        "@e@e" => "i",
        "@o@w" => "ɑʊ",
        "@o@u" => "ɑʊ",
        "@n@g" => "ŋ",
        "@d@g" => "ʤ",
    );
    $IPA = str_replace(array_keys($digraphs), array_values($digraphs), $IPA);

    $letters = array(
        "@a" => "eɪæəɑ",
        "@b" => "b",
        "@c" => "sk",
        "@d" => "d",
        "@e" => "iəɛ",
        "@f" => "f",
        "@g" => "g",
        "@h" => "h",
        "@i" => "aɪə",
        "@j" => "ʤ",
        "@k" => "k",
        "@l" => "l",
        "@m" => "m",
        "@n" => "n",
        "@o" => "ɔo",
        "@p" => "p",
        "@q" => "k",
        "@r" => "ɽ",
        "@s" => "s",
        "@t" => "t",
        "@u" => "ʌʊju",
        "@v" => "v",
        "@w" => "w",
        "@x" => "ks",
        "@y" => "jɪ",
        "@z" => "zʒ",
        "@"  => "",   // drop the markers that no rule consumed
    );
    // IPA symbols kept at hand for writing further rules:
    // "ɶɜɾɐəɑβçðɛɱɣɥɪɲKʎMŋɔPɒʁʃθʊʌWχʏʒʙɴ̃ʀħɒɖʤɦʝɳɸɽʂʧʦʐæʉ";
    $IPA = str_replace(array_keys($letters), array_values($letters), $IPA);

    return $IPA;
}

// ***********************************************************************
// Read in all the text files and inventorize the languages they contain
// ***********************************************************************

$magusfile = array(
    1 => "./magus/slavic1.txt",
    2 => "./magus/slavic2.txt",
    3 => "./magus/slavic3.txt",
    4 => "./magus/germanic.txt",
    5 => "./magus/romance1.txt",
    6 => "./magus/romance2.txt",
    7 => "./magus/baltic&ural-altaic.txt",
    8 => "./magus/celtic&basque.txt",
    9 => "./magus/other.txt",
);

$magustext = array();       // whole file, lines joined with "<br>"
$maguslanguages = array();  // first line of each file: tab separated language names
foreach ($magusfile as $key => $filename) {
    $maguscontents = is_readable($filename) ? file($filename) : false;
    if ($maguscontents === false || count($maguscontents) == 0) {
        // Skip files that are missing or empty instead of failing later on.
        echo "<B>Warning</B> cannot read ".$filename."<br>";
        continue;
    }
    // file() keeps the line ends, so every row still ends in chr(10).
    $magustext[$key] = implode("<br>", $maguscontents);
    $maguslanguages[$key] = $maguscontents[0];
}

// ***********************************************************************
// What word are we processing?
// ***********************************************************************

// Only the first line of the index file is processed.
$indexfile = "./magus/index.txt";
$indexcontents = is_readable($indexfile) ? file($indexfile) : false;
if ($indexcontents === false || count($indexcontents) == 0) {
    echo "<B>Error</B> cannot read ".$indexfile;
    exit;
}
$currentword = trim($indexcontents[0]);
if ($currentword === "") {
    echo "<B>Error</B> the first line of ".$indexfile." is empty";
    exit;
}

// ***********************************************************************
// Find this word in the different text files
// ***********************************************************************

$word = array();  // language name => wikified translation(s)
foreach ($magustext as $langkey => $text) {
    // NOTE(review): this takes the first occurrence of the word anywhere in
    // the file. The columns only line up with the header when that
    // occurrence is at the start of a row - confirm against the data files.
    $textpos = strpos($text, $currentword);
    if ($textpos === false) {
        // Word not in this file; without this check the header row itself
        // would be parsed as if it were the translations.
        continue;
    }
    $lineendpos = strpos($text, chr(10), $textpos + 1);
    if ($lineendpos === false) {
        // Last row without a trailing newline.
        $lineendpos = strlen($text);
    }
    $textextract = substr($text, $textpos, $lineendpos - $textpos);
    $words = explode("\t", $textextract);
    $languages = explode("\t", $maguslanguages[$langkey]);

    foreach ($languages as $languagekey => $language) {
        $language = trim($language);
        // A row may have fewer cells than the header has languages.
        $cell = isset($words[$languagekey]) ? $words[$languagekey] : "";
        $terms = explode(",", $cell);
        // "term (m.)" becomes "term]] ''m''"; the surplus "]]" that the
        // wrapping below adds after the gender is removed again.
        $terms = str_replace(
            array(' (m.)', ' (f.)', ' (n.)'),
            array("]] ''m''", "]] ''f''", "]] ''n''"),
            $terms
        );
        $links = array();
        foreach ($terms as $term) {
            $links[] = str_replace("'']]", "''", "[[".trim($term)."]]");
        }
        $word[$language] = implode(", ", $links);
    }
}

// The English term is the page title, so it is kept without link brackets.
if (!isset($word['English'])) {
    echo "<B>Error</B> no English term found for ".$currentword;
    exit;
}
$word['English'] = str_replace(array(']]', '[['), '', $word['English']);
$IPA = ConvertToIPA($word['English']);

// ***********************************************************************
// Build up text area contents
// ***********************************************************************

$entry  = "=== Pronunciation ===\r\n";
// $entry .= "*[[w:SAMPA|SAMPA]]: /'/\r\n";
$entry .= "*[[w:IPA|IPA]]: /'".$IPA."/\r\n\r\n";
$entry .= "=== Noun ===\r\n\r\n";
$entry .= "'''".$word['English']."'''\r\n\r\n";
$entry .= "# [[animal]]\r\n\r\n";
$entry .= "=== Translations ===\r\n\r\n";

// One row per output line:
//   0: language name shown in the entry
//   1: key into $word for the main form        (default: same as 0)
//   2: key into $word for a second form that is added in parentheses
//      when it is not empty                    (default: none)
//   3: text put in front of the main form      (default: "")
//   4: text put after the forms                (default: "")
$translationrows = array(
    array("Albanian"),
    array("Basque"),
    array("Belarussian", "Belarussian (c)", "Belarussian (l)"),
    array("Breton"),
    array("Bulgarian", "Bulgarian (c)", "Bulgarian (l)"),
    array("Catalan"),
    array("Croatian"),
    array("Czech"),
    array("Danish"),
    array("Dutch", "Dutch", null, "", " '' ''"),
    array("Esperanto"),
    array("Estonian"),
    array("Faroese"),
    array("Finnish"),
    array("French", "French", null, "", " '' ''"),
    array("Frisian"),
    array("Friulian"),
    array("Gallegan"),
    array("German"),
    array("Greek", "Greek", "Greek (a)", "", " '' ''"),
    array("Hungarian"),
    array("Icelandic"),
    array("Indonesian", "Indonesian", null, "[[", "]]"),
    array("Irish"),
    array("Italian", "Italian", null, "", " '' ''"),
    array("Japanese", "Japanese", null, "[[", "]] ([[]], )"),
    array("Ladin"),
    array("Latin", "Latin", null, "", " '' ''"),
    array("Latvian"),
    array("Lithuanian"),
    array("Lower Sorbian"),
    array("Macedonian", "Macedonian (c)", "Macedonian (l)"),
    array("Maltese"),
    array("Norwegian"),
    array("Occitan"),
    array("Polish", "Polish", null, "", " '' ''"),
    array("Portuguese"),
    array("Romanian"),
    array("Romansh"),
    array("Romany"),
    array("Russian", "Russian (c)", "Russian (l)", "", " '' ''"),
    array("Sami"),
    array("Sardinian"),
    array("Scottish"),
    array("Serbian", "Serbian (c)", "Serbian (l)"),
    array("Slovak"),
    array("Slovenian"),
    array("Spanish", "Spanish", null, "", " '' ''"),
    array("Swedish"),
    array("Turkish"),
    array("Ukrainian", "Ukrainian (c)", "Ukrainian (l)"),
    array("Upper Sorbian"),
    array("Welsh"),
    array("Yiddish"),
);

foreach ($translationrows as $row) {
    // Fill in the defaults for the fields a row leaves out.
    $row = $row + array(1 => $row[0], 2 => null, 3 => "", 4 => "");

    // Languages that were not found in any file give an empty translation.
    $mainform = isset($word[$row[1]]) ? $word[$row[1]] : "";
    $secondform = ($row[2] !== null && isset($word[$row[2]])) ? $word[$row[2]] : "";

    $entry .= "*[[".$row[0]."]]: ".$row[3].$mainform;
    if (strlen($secondform)) {
        $entry .= " (".$secondform.")";
    }
    $entry .= $row[4]."\r\n";
}

// ***********************************************************************
// Get existing contents
// ***********************************************************************

//$file1="http://wiktionary.org/w/wiki.phtml?title=User:Polyglot/testing/test1&action=edit";
echo "word[English]: ".$word['English'];
// The title goes into a query string, so it has to be URL-encoded.
$url1 = "http://wiktionary.org/w/wiki.phtml?title=".rawurlencode(str_replace(" ", "_", $word['English']));
$file1 = $url1."&action=edit";
$text1 = file_get_contents($file1);
if ($text1 === false) {
    echo "<B>Error</B> cannot fetch ".$file1;
    exit;
}

// Current wikitext: everything between the first <textarea ...> and its
// closing tag. It stays empty when the page has no textarea.
$textareawpTextbox1 = "";
$textareainit = strpos($text1, "<textarea");
if ($textareainit !== false) {
    $textareapos = strpos($text1, ">", $textareainit);
    $textareaend = strpos($text1, "</textarea", $textareainit);
    if ($textareapos !== false && $textareaend !== false && $textareaend > $textareapos) {
        $textareawpTextbox1 = (string) substr($text1, $textareapos + 1, $textareaend - $textareapos - 1);
    }
}
// Inside the HTML the text is entity-escaped; undo that so that "&", "<"
// and ">" in the existing entry are not posted back as "&amp;", "&lt;", ...
$textareawpTextbox1 = html_entity_decode($textareawpTextbox1, ENT_QUOTES, 'UTF-8');

// Edit timestamp: the value of the hidden input named wpEdittime.
$textareawpEdittime = "";
if (preg_match('/<input type=hidden value="([^"]*)" name=\'wpEdittime\'/', $text1, $edittimematch)) {
    $textareawpEdittime = $edittimematch[1];
}

// ***********************************************************************
// Propose new contents merged with old contents
// ***********************************************************************

$data = array();
$data["enctype"] = "application/x-www-form-urlencoded; charset=utf-8";
if ($textareawpTextbox1 === "Put your text for the new page here." || $textareawpTextbox1 === "") {
    // Nothing to merge with: the page does not exist yet.
    $textareawpTextbox1 = "";
    $data["wpSummary"] = "New entry based on http://www.informatika.bf.uni-lj.si/magus.html, added IPA";
} else {
    // Keep the old text and put the new entry below a separator.
    $textareawpTextbox1 .= "\r\n\r\n*********************\r\n\r\n";
    $data["wpSummary"] = "merged with translations found at http://www.informatika.bf.uni-lj.si/magus.html, added IPA";
}
$data["wpTextbox1"] = $textareawpTextbox1.$entry;
$data["wpMinoredit"] = "0";
// $data["wpSave"] = "Save page";
$data["wpPreview"] = "Show preview";  // only ask for a preview, never save
$data["wpEdittime"] = $textareawpEdittime;
$data["action"] = "edit";

$resulta = post_it($data, $url1);

if (isset($resulta["errno"])) {
    $errno = $resulta["errno"];
    $errstr = $resulta["errstr"];
    echo "<B>Error $errno</B> $errstr";
    exit;
}

// The first 17 lines of the raw response are not shown.
// NOTE(review): presumably these are the HTTP response headers; the count is
// hard-coded and depends on what the server sends - confirm.
echo implode("", array_slice($resulta, 17));
?>