#!/usr/bin/perl require 5.008; our $VERSION = "0.02"; # 2006-02-28 use strict; use warnings; use utf8; use Encode; use Encode::Guess; use Getopt::Long; our $UNDEF_UNI = 0xfffd; our $REVERSE; our $LANG = "japanese"; our $OUTPUT; our $INPUT = "<&STDIN"; our %GUESS = ( 'japanese' => [qw/euc-jp shiftjis 7bit-jis utf8/], 'chinese' => [qw/euc-cn big5-eten hz utf8/], 'korean' => [qw/euc-kr iso-2022-kr utf8/], 'latin-1' => [qw/utf8 iso-8859-1/], ); our $DEFAULT_ENCODING = "utf8"; $ENV{"PERLDOC"} = "" if ! exists $ENV{"PERLDOC"}; $ENV{"PERLDOC"} .= " " if $ENV{"PERLDOC"} ne ""; $ENV{"PERLDOC"} .= "-wcenter:'native2ascii Command Manual'"; =pod =head1 NAME native2ascii - convert language-specific-encoded file to be Java-acceptable. =head1 SYNOPSIS B [--reverse] [--encoding ENCODING] [-o outputfile] [inputfile] B [-reverse] [-encoding ENCODING] [inputfile] [outputfile] =head1 Options =over 8 =item B<--help> show help message about options. =item B<--man> show man page. =item B<--version> show version infomation. =item B<-J> guess encoding of Japanese. =item B<--encoding> F specify the encoding. =back =head1 DESCRIPTION B will read given an input encoded file and output an ascii-encoded file to be acceptable by Java compiler. =head1 AUTHORS JRF Ehttp://jrf.cocolog-nifty.com/softwareE =head1 COPYRIGHT Copyright 2006 by JRF L This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See L =head1 SEE ALSO L, C, C, C, C =cut Getopt::Long::Configure("bundling", "auto_version"); GetOptions( "o|output=s" => \$OUTPUT, "r|reverse" => \$REVERSE, "e|encoding=s" => \$LANG, "C" => sub {$LANG = "chinese"}, "J" => sub {$LANG = "japanese"}, "K" => sub {$LANG = "korean"}, "iso-8859|L:1" => sub {$LANG = "latin-$_[1]"}, "man" => sub {pod2usage(-verbose => 2)}, "h|?" => sub {pod2usage(-verbose => 0, -output=>\*STDOUT, -exitval => 1)}, "help" => sub {pod2usage(1)}, ) or pod2usage(-verbose => 0); if (@ARGV > 0) { $INPUT = shift @ARGV; } if (@ARGV > 0) { pod2usage(-verbose => 0) if defined $OUTPUT; $OUTPUT = shift @ARGV; } pod2usage(-verbose => 0) if @ARGV > 0; if ($REVERSE && ! find_encoding($LANG)) { $LANG = $DEFAULT_ENCODING; } MAIN: { open(INPUT, $INPUT) or die "Cannot open file: $INPUT\n"; my $text = join("", ); close(INPUT); if ($REVERSE) { $text =~ s(\\u([Dd][89ABab][0-9A-Fa-f]{2}) \\u([Dd][C-Fc-f][0-9A-Fa-f]{2})){ my $pre = $`; my $orig = $&; my $c = $1; my $l = $2; if ($pre =~ /([^\\]|\A)\\(?:\\\\)*$/) { $orig; } else { chr(((hex($c) - 0xd800) << 10) + (hex($l) + 0x2400)); } }gsex; $text =~ s(\\(u+)([0-9A-Fa-f]{1,4})){ my $pre = $`; my $orig = $&; my $u = $1; my $c = $2; if ($pre =~ /([^\\]|\A)\\(?:\\\\)*$/) { $orig; } else { if (length($u) > 1) { "\\" . ("u" x (length($u) - 1)) . $c; } else { chr(hex($c)); } } }gsex; $text =~ s(\\(U+)([0-9A-Fa-f]{1,8})){ my $pre = $`; my $orig = $&; my $u = $1; my $c = $2; if ($pre =~ /([^\\]|\A)\\(?:\\\\)*$/) { $orig; } else { if (length($u) > 1) { "\\" . ("U" x (length($u) - 1)) . $c; } else { chr(hex($c)); } } }gsex; $text = encode($LANG, $text); } else { if (exists $GUESS{$LANG}) { my $guess = $GUESS{$LANG}; my $enc = guess_encoding($text, @$guess); if (defined $enc && ! ref $enc) { ($enc) = grep {my $a = $_; grep {$_ eq $a} (split $enc)} @$guess; $enc = find_encoding($enc); } if (!defined $enc) { $enc = find_encoding($DEFAULT_ENCODING); } $text = $enc->decode($text); } else { $text = decode($LANG, $text); } $text =~ s(\\(u+|U+)([0-9A-Fa-f]{1,4})){ my $pre = $`; my $orig = $&; my $u = $1; my $c = $2; if ($pre =~ /([^\\]|\A)\\(?:\\\\)*$/) { $orig; } else { "\\" . ($u . substr($u, 0, 1)) . $c; } }gsex; $text =~ s([^\x00-\x7f]) { my $c = ord($&); if ($c < 0x10000) { sprintf("\\u%04x", $c); } elsif ($c < 0x110000) { $c -= 0x10000; sprintf("\\u%04x\\u%04x", ($c >> 10) + 0xd800, ($c & 0x3ff) + 0xdc00); } else { sprintf("\\u%04x", $UNDEF_UNI); } }gsex; } if (defined $OUTPUT) { open(OUTPUT, ">", $OUTPUT) or die "Cannot create file: $OUTPUT\n"; } else { open(OUTPUT, ">&STDOUT"); } print OUTPUT $text; close(OUTPUT); }