#!/usr/bin/perl

use strict;
use warnings;

use DBI;
use DBD::mysql;
use Encode;
use Getopt::Long;
use HTML::Parser;
use HTML::Tagset;
use Text::Wrap;

# Command-line configuration.  Every value below is a default that the
# matching option can override.
my $extra    = 0;               # --extra: randomly skip longer contexts
my $max      = 2;               # --max: most preceding tokens kept as context
my $dbhost   = 'localhost';     # --dbhost
my $dbname   = 'wordpress';     # --dbname
my $dbuser;                     # --user
my $dbpasswd;                   # --password
my $num      = 1;               # --num: number of texts to generate
my $verbose  = 0;               # --verbose: print word statistics (repeatable)

# Short aliases must be unique: -n belongs to --num, so --dbname uses -d.
# GetOptions returns false when the command line is malformed; stop with a
# usage message instead of carrying on with half-parsed settings.
GetOptions('user|u=s'     => \$dbuser,
           'password|p=s' => \$dbpasswd,
           'dbhost|h=s'   => \$dbhost,
           'dbname|d=s'   => \$dbname,
           'max|m=i'      => \$max,
           'extra!'       => \$extra,
           'num|n=i'      => \$num,
           'verbose|v+'   => \$verbose)
  or die("Usage: $0 [--user|-u USER] [--password|-p PASSWORD]"
         . " [--dbhost|-h HOST] [--dbname|-d NAME] [--max|-m N]"
         . " [--[no]extra] [--num|-n N] [--verbose|-v]\n");

# Corpus statistics gathered from the published posts.
my %words;                      # token => occurrences (only with --verbose)
my $nwords = 0;                 # total number of tokens seen
my %tuples;                     # context => { next token => count }

# Tags that HTML::Tagset classifies as phrase markup but that should still
# separate words in the extracted text.
my %is_separator = map { $_ => 1 } qw(br embed);

# mysql_enable_utf8 asks DBD::mysql to return decoded character strings, so
# that \w and \x{2019} in the tokenizer below match characters, not bytes.
my $dsn = sprintf('DBI:mysql:database=%s;host=%s', $dbname, $dbhost);
my $dbh = DBI->connect($dsn, $dbuser, $dbpasswd,
                       { RaiseError => 1, mysql_enable_utf8 => 1 });
my $sth = $dbh->prepare(q[SELECT post_content
                          FROM wp_posts
                          WHERE post_status = 'publish']);
$sth->execute();
while (defined(my $row = $sth->fetch())) {
  # Strip the markup: keep the (entity-decoded) text and put a space wherever
  # a non-phrase tag, or br/embed, opens or closes so that adjacent words do
  # not run together.
  my $plain_content = '';
  my $add_space = sub {
    my $tag = shift;
    $plain_content .= ' '
      if (!$HTML::Tagset::isPhraseMarkup{$tag} || $is_separator{$tag});
  };
  my $parser = HTML::Parser->new(api_version => 3);
  $parser->handler(start => $add_space, 'tagname');
  $parser->handler(end   => $add_space, 'tagname');
  $parser->handler(text  => sub { $plain_content .= $_[0] }, 'dtext');
  $parser->parse($row->[0]);
  $parser->eof();

  # Tokens are runs of word characters, hyphens and apostrophes, or a single
  # punctuation mark that is not directly followed by a word character.
  # @latest is the sliding context window; the empty string marks the start
  # of a post here and, as the recorded token, its end.
  my @latest = ('');
  while ($plain_content =~ /([-\w'\x{2019}]+|[,.?!](?!\w))/g) {
    my $word = lc($1);
    ++$words{$word} if $verbose;
    ++$nwords;
    add_tuples($word, @latest);
    push(@latest, $word);
    shift @latest if (@latest > $max);
  }
  add_tuples('', @latest);
}
$sth->finish();
$dbh->disconnect();

# With --verbose, report the number of distinct tokens, then list every token
# with its count and its share of all tokens, most frequent first (ties are
# broken alphabetically).
if ($verbose) {
  printf("%d words\n", scalar keys %words);
  my @by_frequency =
    sort { ($words{$b} <=> $words{$a}) || ($a cmp $b) } keys %words;
  foreach my $word (@by_frequency) {
    # Tokens may contain non-ASCII characters (the tokenizer accepts U+2019).
    # Encode a copy the same way the generated text is encoded so that print
    # does not emit "Wide character" warnings.
    my $shown = encode('us-ascii', "$word");
    printf("  %s: %d (%.2f%%)\n",
           $shown, $words{$word}, 100 * $words{$word} / $nwords);
  }
  print "\n";
}

# Generate $num texts.  Each text starts from the empty start marker and is
# extended one token at a time: the longest recorded context matching the end
# of the output so far is looked up in %tuples and a follower is drawn at
# random, weighted by how often it was seen.  Drawing the empty token ends
# the text.
while ($num-- > 0) {
  my @output = ('');            # '' is the start-of-text marker
  while (1) {
    my $next;
    # Try the longest context first and fall back to shorter ones.
    for (my $i = $max; $i >= 0; --$i) {
      next if ($#output < $i);
      my $tuple = join(' ', @output[$#output - $i .. $#output]);
      my $choices = $tuples{$tuple};
      next unless defined($choices);
      # With --extra, occasionally ignore a multi-token context (with
      # probability 1/($i + 1)) so that a shorter one is used instead.
      next if ($extra && ($i > 0) && (int(rand($i + 1)) == 0));

      # Weighted pick: draw $r below the total count, then walk the
      # followers subtracting each count until $r falls inside one.
      my $n = 0;
      $n += $_ foreach (values %$choices);
      my $r = int(rand($n));
      foreach my $choice (keys %$choices) {
        my $m = $choices->{$choice};
        if ($r < $m) {
          $next = $choice;
          last;
        }
        $r -= $m;
      }
      last if defined($next);
    }
    # An undefined $next means no context matched at all (for instance when
    # the tuple table is empty); treat it like the end marker instead of
    # comparing an undefined value.
    if (!defined($next) || $next eq '') {
      shift @output;            # drop the start marker
      @output = map { encode('us-ascii', $_) } @output;
      my $text = wrap('', '', @output);
      print $text;
      print "\n\n";
      last;
    }
    push(@output, $next);
  }
}

# add_tuples($word, @latest)
#
# Records $word as a follower of every trailing run of @latest: the last
# token alone, the last two tokens, and so on up to the whole list.  Each run
# is joined with single spaces to form the key into the file-level %tuples
# hash, whose value maps follower tokens to the number of times seen.
sub add_tuples {
  my($word, @latest) = @_;

  # Walk the start index backwards so the shortest context is handled first.
  foreach my $start (reverse 0 .. $#latest) {
    my $context = join(' ', @latest[$start .. $#latest]);
    ++$tuples{$context}{$word};   # inner hash is autovivified on first use
  }
}
