#!/usr/bin/perl

use PDL;
use strict;
use warnings;

# Collection-wide accumulator shared by every routine in this script.  Each
# array holds one sample per processed file, except 'word length' which holds
# one sample per distinct word in 'lexicon' (filled in just before reporting).
# The key is spelled 'word length' (with a space) because the same string is
# used to look the series up and to build its label in the report.
my $metrics = {'file_count'  =>  0,
               't_lines'     =>  0,
               'lines'       => [],
               'paragraphs'  => [],
               'characters'  => [],
               'sentences'   => [],
               'words'       => [],
               'word length' => [],
               'unique'      => [],
               'lexicon'     => {},
              };

print "====== 'Text Metric'inator ======\n";
print "Reports on the text metrics of a collection of TXT files while also attempting\n";
print "to take over the tri-state area.\n\n";
if (!defined $ARGV[0] || !-d $ARGV[0])
{
  print "Usage: text_metricinator.pl <collect directory>\n";
  # a missing or invalid argument is a failure, so report it to the caller
  exit 1;
}

print " * Calculating text metrics from collection...\n";
scanDirectory($ARGV[0], $metrics);
print "   Done\n\n";

# word lengths are measured once per distinct word, not once per occurrence
foreach my $word (keys %{$metrics->{'lexicon'}})
{
  push(@{$metrics->{'word length'}}, length($word));
}

print "===== Report =====\n";
print ' File count: ' . $metrics->{'file_count'} . "\n";
if ($metrics->{'file_count'} == 0)
{
  # with no samples there are no statistics to compute
  print " No TXT files found - nothing to report\n";
}
else
{
  foreach my $metric ( sort ('characters','lines','words','sentences','unique','word length') )
  {
    my $report_line = generateReportLine($metric, $metrics);
    print $report_line . "\n";
  }
}
print "Complete!\n\n";
exit;


## @function
# Given the path to a text file, calculates various text metrics and appends
# one sample per metric to the ever-growing metric data structure.  A status
# line naming the file is printed to STDOUT.  Metrics include:
#  number of lines of text (lines containing something other than whitespace)
#  number of paragraphs (counted as lines with no text...)
#  number of sentences (counted as full stops on lines of text)
#  number of characters (on lines of text, line terminator included)
#  number of words (whitespace separated, lowercased, punctuation removed)
#  number of words in this document not already present in the lexicon
#
# Every word occurrence is also tallied in $metrics->{'lexicon'}.  Dies if
# the file cannot be opened.
#
# @param  $txt_path  path to the text file; read through the :utf8 layer
# @param  $metrics   hash reference holding the 'lexicon' hash and the
#                    per-file sample arrays 'characters', 'lines',
#                    'paragraphs', 'sentences', 'unique' and 'words'
sub extractTextMetrics
{
  my ($txt_path, $metrics) = @_;
  print ' - extracting metrics from: ' . $txt_path . "\n";
  my $number_of_lines_of_text = 0;
  my $number_of_characters = 0;
  my $number_of_sentences = 0;
  my $number_of_words = 0;
  my $number_of_paragraphs = 0;
  my $unique_words = 0;
  open(my $txt_fh, '<:utf8', $txt_path) or die('Failed to open file for reading: ' . $txt_path . ' (' . $! . ')');
  while (my $line = <$txt_fh>)
  {
    if ($line =~ /^\s*$/)
    {
      $number_of_paragraphs++;
      next;
    }
    $number_of_lines_of_text++;
    $number_of_characters += length($line);
    # tr/// with an empty replacement counts the full stops without changing
    # the line, and is also correct for a last line lacking a newline
    $number_of_sentences += ($line =~ tr/.//);
    # try to sanitize text to only contains 'words'
    my $clean_line = lc($line);
    $clean_line =~ s/[[:punct:]]//g;
    # split on ' ' (not /\s+/) so leading whitespace, including any left
    # behind by removed punctuation, never yields an empty 'word'
    foreach my $word (split(' ', $clean_line))
    {
      $number_of_words++;
      if (!exists $metrics->{'lexicon'}->{$word})
      {
        $unique_words++;
      }
      $metrics->{'lexicon'}->{$word}++;
    }
  }
  close($txt_fh);
  # update metrics with entries for the information captured above
  push(@{$metrics->{'characters'}}, $number_of_characters);
  push(@{$metrics->{'lines'}}, $number_of_lines_of_text);
  push(@{$metrics->{'paragraphs'}}, $number_of_paragraphs);
  push(@{$metrics->{'sentences'}}, $number_of_sentences);
  push(@{$metrics->{'unique'}}, $unique_words);
  push(@{$metrics->{'words'}}, $number_of_words);
}
## extractTextMetrics() ##


## @function
# Builds one line of the report summarising a single metric series.
#
# @param  $metric   key into $metrics naming the series to summarise (for
#                   example 'characters' or 'word length'); with each
#                   space-separated word capitalised it also becomes the label
# @param  $metrics  hash reference whose $metric entry is an array reference
#                   of numeric samples
# @return a string without trailing newline: ' <Label>: avg: ..., min: ...,
#         med: ..., max: ..., sdev: ..., adev: ..., prms: ...' when samples
#         exist, or ' <Label>: no data' when the series is missing or empty
sub generateReportLine
{
  my ($metric, $metrics) = @_;
  my $label = join(' ', map { ucfirst($_) } split(/ /, $metric));
  my $samples = $metrics->{$metric};
  # with no samples there is nothing to feed to the statistics routine
  if (ref($samples) ne 'ARRAY' || !@{$samples})
  {
    return sprintf(' %s: no data', $label);
  }
  # an array reference always produces a one-dimensional piddle, even when
  # it holds a single sample
  my $piddle = pdl($samples);
  # statsover() returns (mean, prms, median, min, max, adev, rms)
  my @stats = statsover($piddle);
  return sprintf(' %s: avg: %0.2f, min: %d, med: %d, max: %d, sdev: %0.2f, adev: %0.2f, prms: %0.2f', $label, $stats[0], $stats[3], $stats[2], $stats[4], $stats[6], $stats[5], $stats[1]);
}
##


## @function
# Recursively walks a directory, passing every file whose name ends in .txt
# (case-insensitive) to extractTextMetrics() and counting it in
# $metrics->{'file_count'}.  Entries whose names start with a dot are skipped,
# sub-directories are descended into, and anything else is ignored.  A status
# line naming the directory is printed to STDOUT.
#
# Entries are visited in sorted name order so repeated runs over the same
# collection process files in the same sequence; the per-file 'unique' metric
# depends on which files were seen earlier.
#
# A directory that cannot be opened is reported with warn() and skipped.
#
# @param  $dir      path of the directory to scan
# @param  $metrics  hash reference accumulating the collection metrics
sub scanDirectory
{
  my ($dir, $metrics) = @_;
  print ' - searching directory: ' . $dir . "\n";
  my $dh;
  if (!opendir($dh, $dir))
  {
    warn('Failed to open directory for reading: ' . $dir . ' (' . $! . ")\n");
    return;
  }
  # read everything and release the handle before recursing
  my @files = sort readdir($dh);
  closedir($dh);
  foreach my $file (@files)
  {
    # skip dotted files of any type (this also covers '.' and '..')
    next if ($file =~ /^\./);
    my $path = $dir . '/' . $file;
    # recurse directories
    if (-d $path)
    {
      scanDirectory($path, $metrics);
    }
    # process TXT files
    elsif ($file =~ /\.txt$/i)
    {
      extractTextMetrics($path, $metrics);
      $metrics->{'file_count'}++;
    }
  }
  return;
}
## scanDirectory() ##