#!/usr/bin/perl
require 5.008;
our $VERSION = "0.03"; # 2009-09-08

use strict;
use warnings;

use utf8;
use Encode;
use Encode::Guess;
use Pod::Usage;
use Getopt::Long;

our $UNDEF_UNI = 0xfffd;

our $REVERSE;
our $LANG = "japanese";
our $OUTPUT;
our $INPUT = "<&STDIN";
our %GUESS = (
	      'japanese' => [qw/euc-jp shiftjis 7bit-jis utf8/],
	      'chinese' => [qw/euc-cn big5-eten hz utf8/],
	      'korean' => [qw/euc-kr iso-2022-kr utf8/],
	      'latin-1' => [qw/utf8 iso-8859-1/],
	     );
our $DEFAULT_ENCODING = "utf8";

$ENV{"PERLDOC"} = "" if ! exists $ENV{"PERLDOC"};
$ENV{"PERLDOC"} .= " " if $ENV{"PERLDOC"} ne "";
$ENV{"PERLDOC"} .= "-wcenter:'native2ascii Command Manual'";

=pod

=head1	NAME

native2ascii - convert language-specific-encoded file to be Java-acceptable.

=head1	SYNOPSIS

B<native2ascii> [--reverse] [--encoding ENCODING] [-o outputfile] [inputfile]

B<native2ascii> [-reverse] [-encoding ENCODING] [inputfile] [outputfile]

=head1	Options

=over 8

=item B<--help>

show help message about options.

=item B<--man>

show man page.

=item B<--version>

show version infomation.

=item B<-J>

guess encoding of Japanese.

=item B<--encoding> F<encoding>

specify the encoding.

=back

=head1	DESCRIPTION

B<This program> will read given an input encoded file and output an 
ascii-encoded file to be acceptable by Java compiler.

=head1	AUTHORS

JRF E<lt>http://jrf.cocolog-nifty.com/softwareE<gt>

=head1	COPYRIGHT

Copyright 2006 by JRF L<http://jrf.cocolog-nifty.com/software/>

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

See L<http://www.perl.com/perl/misc/Artistic.html>

=head1	SEE ALSO

L<Encode>, C<iconv(1)>, C<nkf(1)>, C<ack(1)>, C<mg(1)>

=cut

Getopt::Long::Configure("bundling", "auto_version");
GetOptions(
	   "o|output=s"  => \$OUTPUT,
	   "r|reverse" => \$REVERSE,
	   "e|encoding=s" => \$LANG,
	   "C" => sub {$LANG = "chinese"},
	   "J" => sub {$LANG = "japanese"},
	   "K" => sub {$LANG = "korean"},
	   "iso-8859|L:1" => sub {$LANG = "latin-$_[1]"},
	   "man" => sub {pod2usage(-verbose => 2)},
	   "h|?" => sub {pod2usage(-verbose => 0, -output=>\*STDOUT, 
				   -exitval => 1)},
	   "help" => sub {pod2usage(1)},
	   ) or pod2usage(-verbose => 0);

if (@ARGV > 0) {
  $INPUT = shift @ARGV;
}

if (@ARGV > 0) {
  pod2usage(-verbose => 0) if defined $OUTPUT;
  $OUTPUT = shift @ARGV;
}

pod2usage(-verbose => 0) if @ARGV > 0;

if ($REVERSE && ! find_encoding($LANG)) {
  $LANG = $DEFAULT_ENCODING;
}

MAIN: {
  open(INPUT, $INPUT) or die "Cannot open file: $INPUT\n";
  my $text = join("", <INPUT>);
  close(INPUT);
  if ($REVERSE) {
    $text =~ s(\\u([Dd][89ABab][0-9A-Fa-f]{2})
	       \\u([Dd][C-Fc-f][0-9A-Fa-f]{2})){
      my $pre = $`;
      my $orig = $&;
      my $c = $1;
      my $l = $2;
      if ($pre =~ /([^\\]|\A)\\(?:\\\\)*$/) {
	$orig;
      } else {
	chr(((hex($c) - 0xd800) << 10) + (hex($l) + 0x2400));
      }
    }gsex;
    $text =~ s(\\(u+)([0-9A-Fa-f]{1,4})){
      my $pre = $`;
      my $orig = $&;
      my $u = $1;
      my $c = $2;
      if ($pre =~ /([^\\]|\A)\\(?:\\\\)*$/) {
	$orig;
      } else {
	if (length($u) > 1) {
	  "\\" . ("u" x (length($u) - 1)) . $c;
	} else {
	  chr(hex($c));
	}
      }
    }gsex;
    $text =~ s(\\(U+)([0-9A-Fa-f]{1,8})){
      my $pre = $`;
      my $orig = $&;
      my $u = $1;
      my $c = $2;
      if ($pre =~ /([^\\]|\A)\\(?:\\\\)*$/) {
	$orig;
      } else {
	if (length($u) > 1) {
	  "\\" . ("U" x (length($u) - 1)) . $c;
	} else {
	  chr(hex($c));
	}
      }
    }gsex;
    $text = encode($LANG, $text);
  } else {
    if (exists $GUESS{$LANG}) {
      my $guess = $GUESS{$LANG};
      my $enc = guess_encoding($text, @$guess);
      if (defined $enc && ! ref $enc) {
	($enc) =  grep {my $a = $_; grep {$_ eq $a} (split $enc)}
	  @$guess;
	$enc = find_encoding($enc);
      }
      if (!defined $enc) {
	$enc = find_encoding($DEFAULT_ENCODING);
      }
      $text = $enc->decode($text);
    } else {
      $text = decode($LANG, $text);
    }
    $text =~ s(\\(u+|U+)([0-9A-Fa-f]{1,4})){
      my $pre = $`;
      my $orig = $&;
      my $u = $1;
      my $c = $2;
      if ($pre =~ /([^\\]|\A)\\(?:\\\\)*$/) {
	$orig;
      } else {
	"\\" . ($u . substr($u, 0, 1)) . $c;
      }
    }gsex;
    $text =~ s([^\x00-\x7f]) {
      my $c = ord($&);
      if ($c < 0x10000) {
	sprintf("\\u%04x", $c);
      } elsif ($c < 0x110000) {
	$c -= 0x10000;
	sprintf("\\u%04x\\u%04x",
		($c >> 10) + 0xd800, ($c & 0x3ff) + 0xdc00);
      } else {
	sprintf("\\u%04x", $UNDEF_UNI);
      }
    }gsex;
  }
  if (defined $OUTPUT) {
    open(OUTPUT, ">", $OUTPUT) or die "Cannot create file: $OUTPUT\n";
  } else {
    open(OUTPUT, ">&STDOUT");
  }
  print OUTPUT $text;
  close(OUTPUT);
}
