#!/usr/bin/perl # # ***** BEGIN LICENSE BLOCK ***** # Version: MPL 1.1/GPL 2.0/LGPL 2.1 # # The contents of this file are subject to the Mozilla Public License Version # 1.1 (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # http://www.mozilla.org/MPL/ # # Software distributed under the License is distributed on an "AS IS" basis, # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License # for the specific language governing rights and limitations under the # License. # # The Original Code is mozilla.org code. # # The Initial Developer of the Original Code is # Netscape Communications Corporation. # Portions created by the Initial Developer are Copyright (C) 1999 # the Initial Developer. All Rights Reserved. # # Contributor(s): # # Alternatively, the contents of this file may be used under the terms of # either the GNU General Public License Version 2 or later (the "GPL"), or # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), # in which case the provisions of the GPL or the LGPL are applicable instead # of those above. If you wish to allow use of your version of this file only # under the terms of either the GPL or the LGPL, and not to allow others to # use your version of this file under the terms of the MPL, indicate your # decision by deleting the provisions above and replace them with the notice # and other provisions required by the GPL or the LGPL. If you do not delete # the provisions above, a recipient may use your version of this file under # the terms of any one of the MPL, the GPL or the LGPL. 
# # ***** END LICENSE BLOCK ***** $header = <//eg; $t = foldcombining($t); return rdecompose( $table{$t}); } return $dec; } sub decompose { my ($removeprefix, $dec) = (@_); $removeprefix .= " "; $dec =~ s/$removeprefix//eg; if($dec eq "0020") { $dec = "\\u0020"; } elsif($dec eq "005C") { $dec = "\\u005C"; } else { $k = "\/"; $dec =~ s/2044/$k/eg; $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg; $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g; $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg; $dec =~ s/ //eg; } return $dec; } ###################################################################### # # Open the unicode database file # ###################################################################### open ( UNICODATA , "< UnicodeData-Latest.txt") || die "cannot find UnicodeData-Latest.txt"; open ( UNICODATA2 , "< UnicodeData-Latest.txt") || die "cannot find UnicodeData-Latest.txt"; ###################################################################### # # Open the output file # ###################################################################### open ( OUT , "> ../tables/transliterate.properties") || die "cannot open output ../tables/transliterate.properties file"; print OUT $header; # remove comments from $handcoded $handcoded =~ s/^#[^#].*\n//mg; print OUT $handcoded; ###################################################################### # # Process the file line by line # ###################################################################### while() { chop; @f = split(/;/ , $_); $udec = hex($u); if(($udec > 256 ) && ($f[5] ne "")) { $table{$f[0]}=$f[5]; } } while() { chop; ###################################################################### # # Get value from fields # ###################################################################### @f = split(/;/ , $_); $u = $f[0]; # The unicode value $cmt = $f[1]; # The comment $dec = $f[5]; # The decomposed value $d1 = $f[6]; $d2 = $f[7]; $d3 = $f[8]; $udec = hex($u); if($udec > 128) { # not 
ASCII if($dec ne "") { # have decomposition if($dec =~ //) { output($u,$cmt,$udec,&decompose("", $dec)); } elsif($dec =~ //) { # ignore non ASCII decomposition # warning($_); } elsif($dec =~ //) { output($u,$cmt,$udec,&decompose("", "(".$dec.")")); } elsif($dec =~ //) { output($u,$cmt,$udec,&decompose("", $dec)); } elsif($dec =~ //) { output($u,$cmt,$udec,&decompose("", $dec)); } elsif($dec =~ //) { # warning($_); } elsif($dec =~ //) { output($u,$cmt,$udec,"^(".&decompose("", $dec).")"); } elsif($dec =~ //) { output($u,$cmt,$udec,"v(".&decompose("", $dec).")"); } elsif($dec =~ //) { output($u,$cmt,$udec,&decompose("", $dec)); } elsif($dec =~ //) { # ignore # warning($_); } elsif($dec =~ //) { output($u,$cmt,$udec,&decompose("", $dec)); } elsif($dec =~ //) { # ignore # warning($_); } elsif($dec =~ //) { # ignore # warning($_); } elsif($dec =~ //) { # ignore # warning($_); } elsif($dec =~ //) { # ignore # warning($_); } elsif($dec =~ //) { if($dec eq " 0020") { output($u,$cmt,$udec,"\\u0020"); } else { # ignore # warning($_); } } else { warning($_); } } else { # decomposition without format code if($cmt =~ /LATIN/) { $dec = foldcombining($dec); output($u,$cmt,$udec,&decompose("", $dec)); } elsif($cmt =~ /CYRILLIC/) { # ignore # warning($_); } elsif($cmt =~ /GREEK/) { # ignore # warning($_); } elsif($cmt =~ /ARABIC/) { # ignore # warning($_); } elsif($cmt =~ /CJK/) { # ignore # warning($_); } elsif($cmt =~ /HEBREW/) { # ignore # warning($_); } elsif($cmt =~ /DEVANAGARI/) { # ignore # warning($_); } elsif($cmt =~ /BENGALI/) { # ignore # warning($_); } elsif($cmt =~ /GURMUKHI/) { # ignore # warning($_); } elsif($cmt =~ /ORIYA/) { # ignore # warning($_); } elsif($cmt =~ /TAMIL/) { # ignore # warning($_); } elsif($cmt =~ /TELUGU/) { # ignore # warning($_); } elsif($cmt =~ /KANNADA/) { # ignore # warning($_); } elsif($cmt =~ /MALAYALAM/) { # ignore # warning($_); } elsif($cmt =~ /SINHALA/) { # ignore # warning($_); } elsif($cmt =~ /TIBETAN/) { # ignore # warning($_); } 
elsif($cmt =~ /MYANMAR/) { # ignore # warning($_); } elsif($cmt =~ /KATAKANA/) { # ignore # warning($_); } elsif($cmt =~ /HIRAGANA/) { # ignore # warning($_); } else { # ignore # warning($_); } } } else { # do not have decomposition if ($d1 ne "") { # are numeric characters output($u,$cmt,$udec,$d1); } elsif ($d2 ne "") { if($cmt =~ /CIRCLED/) { # circled output($u,$cmt,$udec,"(".$d2.")"); } else { # others, use [ ] output($u,$cmt,$udec,"[".$d2."]"); } } elsif ($d3 ne "") { if($cmt =~ /CIRCLED/) { # circled output($u,$cmt,$udec,"(".$d3.")"); } else { # others, use [ ] output($u,$cmt,$udec,"[".$d3."]"); } } else { # not numeric characters } # end of no decomposition } # end of have/not decomposition } } ###################################################################### # # Close files # ###################################################################### close(UNIDATA); close(OUT);
######################################################################
#
# Review notes (documentation only; the code above is unchanged)
#
######################################################################
#
# Overview
#   The script opens UnicodeData-Latest.txt twice (handles UNICODATA and
#   UNICODATA2) and writes ../tables/transliterate.properties through OUT.
#   $header is printed first.  $handcoded is printed next, after every
#   line that begins with '#' followed by a character other than '#' has
#   been deleted from it.
#
# decompose($removeprefix, $dec)
#   Deletes every occurrence of "$removeprefix " (the prefix plus one
#   space) from $dec.  The prefix is interpolated into the pattern
#   unescaped, so regex metacharacters in it are live.
#   - "0020" and "005C" are returned as the literal escapes \u0020 and
#     \u005C.  Presumably this is because the general path below would
#     turn them into a raw space (then stripped) and a raw backslash;
#     confirm against the consumer of the .properties file.
#   - Any other value goes through these steps in order:
#       1. "2044" is replaced by "/".
#       2. Each run of four characters from [0-9A-F] is replaced by the
#          result of rdecompose() on that run.
#       3. Any remaining four-character run gets a \u prefix.
#       4. \u00XX with XX in 00..7F is replaced by the single character
#          pack("C", hex(XX)).
#       5. All spaces are removed.
#   Returns the rewritten string.
#
# First loop
#   Splits each record on ';' and stores field 5 in %table under field 0
#   when $udec > 256 and field 5 is not empty.
#   NOTE(review): $udec is computed as hex($u), but no assignment to $u
#   is visible before this loop (the second loop sets $u = $f[0]).  If
#   $u is unset here, $udec is 0 and %table stays empty.  Confirm whether
#   hex($f[0]) was intended.
#
# Second loop
#   Only records with $udec > 128 are handled.
#   - Field 5 ($dec) not empty: a chain of pattern tests on $dec decides
#     whether the decompose() result is emitted as is, wrapped as
#     "(...)", "^(...)" or "v(...)", or skipped.  A later branch, labelled
#     "decomposition without format code", emits only names matching
#     /LATIN/, after foldcombining(); all other script names are skipped.
#   - Field 5 empty: the first non-empty of fields 6, 7, 8 ($d1, $d2,
#     $d3) is emitted.  $d1 is emitted unchanged.  $d2 and $d3 are wrapped
#     in "(...)" when $cmt matches /CIRCLED/ and in "[...]" otherwise.
#
# Close files
#   NOTE(review): close(UNIDATA) names a handle that is not opened in
#   the visible code; the handles opened are UNICODATA and UNICODATA2.
#
# State of this copy
#   NOTE(review): `$header = <//eg;`, `while()`, `$dec =~ //`,
#   `&decompose("", ...)` and `" 0020"` look as if text between angle
#   brackets was lost.  An empty pattern `//` in Perl reuses the last
#   successful pattern rather than matching a format tag.  The
#   definitions of output(), warning(), foldcombining() and the start of
#   rdecompose() are not visible here, and the brace nesting of the
#   second loop cannot be confirmed.  Restore these from the upstream
#   file before relying on this copy.
#
######################################################################