#!/usr/bin/perl
#require 5.008;
our $VERSION = "0.02"; #Time-stamp: <2014-07-17T07:32:26Z>

## An example code to sum up TXT generated with
## "Cocolog_Pro_Analyzer.user.js" found at 
## "http://jrf.cocolog-nifty.com/software/2014/03/post-3.html".

## JSON の取得がうまくいかないときは、手作業でコピペしたテキストを用意
## するしかない。タイトルのところで改行が入るので、二行ずつで一データの
## テキストを得てそれを処理する。

use strict;
use warnings;
use utf8; # Japanese

use POSIX qw(ceil);
use HTTP::Request::Common;
use LWP::UserAgent;
#require LWP::Protocol::https;

our $TXT;

our $GET_PAGE_TRY = 3;
#our $BASIC_USER_NAME;
#our $BASIC_PASSWORD;

## 私のブログでは、フィギュア写真カテゴリやデスクトップ壁紙カテゴリといっ
## た "FanArt" があり、そこのアクセスを注視していて、そのカウントを特別
## に行う。

our @FanArtCategory =
  qw(
     http://jrf.cocolog-nifty.com/column/cat5351378/index.html
     http://jrf.cocolog-nifty.com/column/cat23589406/index.html
   );

our @FanArtExclude =
  qw(
     http://jrf.cocolog-nifty.com/column/2006/02/post_b8ce.html
     http://jrf.cocolog-nifty.com/column/2013/09/post.html
   );

our @FanArtInclude =
  qw(
      http://jrf.cocolog-nifty.com/column/
   );

our $BASE_REGEXP = qr((?:https?://)?[^/]+\.[^/]+\.[^/]+/)i;


my %FanArt;


sub get_page {
  my ($url) = @_;
  my $ua = LWP::UserAgent->new;
  my $req = GET($url);
  for (my $i = 0; $i < $GET_PAGE_TRY; $i++) {
#    if (defined $BASIC_USER_NAME) {
#      $req->authorization_basic($BASIC_USER_NAME, $BASIC_PASSWORD);
#    }
    my $res = $ua->request($req);
    if (! $res->is_success) {
      warn "Can't read $url .";
      next;
    }
#    print $res->decoded_content;
    return $res->decoded_content;
  }
  die "Can't read $url . TRY = $GET_PAGE_TRY";
}

sub initialize_fan_art_url {
  foreach my $url (@FanArtCategory) {
    my $s = get_page($url);
    while ($s =~ /<a([^>]*\sclass=[\"\']permalink[\"\'][^>]*)>/i) {
      my $attr = $1;
      $s = $';
      if ($s !~ /<\s*\/a\s*>/i) {
	die "Parse Error";
      }
      my $title = $`;
      $s = $';
      if ($attr !~ /href=([\"\'])([^\1]+)\1/i) {
	die "Parse Error";
      }
      my $href = $2;
      $href =~ s/^$BASE_REGEXP//;
#      print "$title\n$href\n\n" if ! exists $FanArt{$href};
      $FanArt{$href} = $title if ! exists $FanArt{$href};
    }
    $url =~ s/\/index.html$/\//;
    $FanArt{$url} = "?" if ! exists $FanArt{$url};
  }
  foreach my $href (@FanArtExclude) {
    $href =~ s/^$BASE_REGEXP//;
    delete $FanArt{$href} if exists $FanArt{$href};
  }
  foreach my $href (@FanArtInclude) {
    $href =~ s/^$BASE_REGEXP//;
    $href =~ s/\/index.html$/\//;
    $FanArt{$href} = "?" if ! exists $FanArt{$href};
  }
}


MAIN:
{
  binmode(STDIN, ":unix:utf8");
  binmode(STDOUT, ":unix:utf8");
  binmode(STDERR, ":unix:utf8");

  if (@ARGV) {
    $TXT = shift(@ARGV);
  }
  if (! defined $TXT || $TXT eq "--help" || $TXT eq "-h") {
    print "usage: $0 ページビューからコピペした.txt\n";
    exit(0);
  }

  open(my $fh, "<", $TXT) or die "$TXT: $!";
  binmode($fh, ":utf8");
  my $s = join("", <$fh>);
  close($fh);

  initialize_fan_art_url(); ## ここをコメントアウトすれば、FanArt関連の
                            ## 処理はなくなる。
  my @data;
  my @l = split(/\n/, $s);
  while (@l) {
    my $l1 = shift(@l);
    my $l2 = shift(@l) if @l;
    if (! defined $l2 || $l2 =~ /^\s*$/s) {
      die "Parse Error.";
    }
    my (undef, $title) = split(/\s+/, $l1, 2);
    my ($url, $check, $pv, $in) = split(/\s+/, $l2);
    if ( ! defined $title || ! defined $in
       || $pv !~ /^[01-9,]+$/ || $in !~ /^[01-9,]+$/) {
      die "Parse Error: '$title' '$url' '$check' '$pv' '$in'.";
    }
    $in =~ s/,//g;
    $pv =~ s/,//g;
    $url =~ s/^[^\/]+\///s;
    push(@data, [$title, $url, $pv, $in]);
  }

  my %blogs;
  my $max_name_len = 0;
  foreach my $l (@data) {
    my ($title, $url, $pv, $in) = @$l;
    die "Parse Error" if $url !~ /^[^\/]+/;
    my $blog = $&;
    $blogs{$blog} = [0, 0] if ! exists $blogs{$blog};
    $blogs{$blog}->[0] += $pv;
    $blogs{$blog}->[1] += $in;
    if (length($blog) > $max_name_len) {
      $max_name_len = length($blog);
    }
  }
  foreach my $l (@data) {
    my ($title, $url, $pv, $in) = @$l;
    if (exists $FanArt{$url}) {
      my $blog = "FanArt";
      $blogs{$blog} = [0, 0] if ! exists $blogs{$blog};
      $blogs{$blog}->[0] += $pv;
      $blogs{$blog}->[1] += $in;
      if (length($blog) > $max_name_len) {
	$max_name_len = length($blog);
      }
    }
  }
  $max_name_len = ceil($max_name_len / 8) * 8;

  foreach my $blog (sort keys %blogs) {
    my ($pv, $in) = @{$blogs{$blog}};
    my $bt = $blog . (" " x ($max_name_len - length($blog)));
    print "$bt\t$pv\t$in\n";
  }

  print "\n";

  my $i = 0;
  foreach my $l (sort {$b->[3] <=> $a->[3]} @data) {
    my ($title, $url, $pv, $in) = @$l;
    $i++;
    print "$i: $title\n$url\n$pv $in\n\n";
    last if $in < 30;  ## 入口回数が 30 未満は切り捨てて上位のみ表示。
  }
}
