#!/usr/bin/perl
#
# Generate random text with a word-level Markov chain trained on the
# published posts of a WordPress database.
#
# Options:
#   --user,     -u USER   database user
#   --password, -p PASS   database password
#   --dbhost,   -h HOST   database host          (default: localhost)
#   --dbname,   -d NAME   database name          (default: wordpress)
#   --max,      -m N      longest context, in words, used to pick the
#                         next word              (default: 2)
#   --[no]extra           randomly fall back to a shorter context
#   --num,      -n N      number of texts to generate (default: 1)
#   --verbose,  -v        also print word frequency statistics

use strict;
use warnings;

use DBI;
use DBD::mysql;
use Encode;
use Getopt::Long;
use HTML::Parser;
use HTML::Tagset;
use Text::Wrap;

my $extra;                  # extra bit of randomization
my $max    = 2;
my $dbhost = 'localhost';
my $dbname = 'wordpress';
my $dbuser;
my $dbpasswd;
my $num = 1;
my $verbose;

# -n is taken by --num, so --dbname gets -d: Getopt::Long cannot map one
# short alias to two different options.
GetOptions(
    'user|u=s'     => \$dbuser,
    'password|p=s' => \$dbpasswd,
    'dbhost|h=s'   => \$dbhost,
    'dbname|d=s'   => \$dbname,
    'max|m=i'      => \$max,
    'extra!'       => \$extra,
    'num|n=i'      => \$num,
    'verbose|v+'   => \$verbose,
) or die "Usage: $0 [--user USER] [--password PASS] [--dbhost HOST]"
       . " [--dbname NAME] [--max N] [--[no]extra] [--num N] [--verbose]\n";

# With no context at all the chain cannot get past the first word.
die "--max must be at least 1\n" if $max < 1;

my %words;          # word => number of occurrences (filled only if --verbose)
my $nwords = 0;     # total number of tokens seen
my %tuples;         # "ctx1 ctx2 ..." => { following word => count }

my $dbh = DBI->connect(
    sprintf('DBI:mysql:database=%s;host=%s', $dbname, $dbhost),
    $dbuser, $dbpasswd, { RaiseError => 1 });
my $sth = $dbh->prepare(
    q[SELECT post_content FROM wp_posts WHERE post_status = 'publish']);
$sth->execute();

# Training: tokenize every post and record which word follows each context.
while (defined(my $row = $sth->fetch())) {
    my $plain_content = html_to_text($row->[0]);

    # The empty string marks both the start and the end of a post.
    my @latest = ('');
    while ($plain_content =~ /([-\w'\x{2019}]+|[,.?!](?!\w))/g) {
        my $word = lc($1);
        ++$words{$word} if $verbose;
        ++$nwords;
        add_tuples($word, @latest);
        push(@latest, $word);
        shift @latest if (@latest > $max);
    }
    add_tuples('', @latest);
}
$sth->finish();
$dbh->disconnect();

if ($verbose) {
    printf("%d words\n", scalar keys %words);
    # Most frequent first; ties broken alphabetically.
    foreach my $word (sort { ($words{$b} <=> $words{$a}) || ($a cmp $b) }
                      keys %words) {
        printf(" %s: %d (%.2f%%)\n",
               $word, $words{$word}, 100 * $words{$word} / $nwords);
    }
    print "\n";
}

die "No published posts found; nothing to generate text from\n"
    unless %tuples;

# Generation: start from the start-of-post marker and keep appending words
# until the end-of-post marker is drawn.
while ($num-- > 0) {
    my @output = ('');
    while (1) {
        my $next = next_word(\@output);
        if (!defined($next) || $next eq '') {
            shift @output;      # drop the start-of-post marker
            # No CHECK argument is given, so characters outside US-ASCII
            # are substituted by Encode rather than raising an error.
            @output = map { encode('us-ascii', $_) } @output;
            print wrap('', '', @output);
            print "\n\n";
            last;
        }
        push(@output, $next);
    }
}

# html_to_text($html)
#
# Returns the text content of $html with entities decoded.  A space is
# inserted at every start and end tag that is not phrase-level markup
# (and at <br> and <embed>), so words on either side of such a tag do not
# run together.  Returns '' when $html is undefined.
sub html_to_text {
    my ($html) = @_;

    my $text = '';
    return $text unless defined($html);

    my $separate = sub {
        my ($tag) = @_;
        if (!$HTML::Tagset::isPhraseMarkup{$tag}
            || $tag eq 'br' || $tag eq 'embed') {
            $text .= ' ';
        }
    };

    my $parser = HTML::Parser->new(api_version => 3);
    $parser->handler(start => $separate, 'tagname');
    $parser->handler(end   => $separate, 'tagname');
    $parser->handler(text  => sub { $text .= $_[0] }, 'dtext');
    $parser->parse($html);
    $parser->eof();

    return $text;
}

# next_word(\@output)
#
# Picks the word that follows the words generated so far.  The longest
# recorded context is tried first, then progressively shorter ones; with
# --extra a context of more than one word is skipped at random, more often
# the shorter it is.  Returns undef if no context of any length is known.
sub next_word {
    my ($output) = @_;

    # add_tuples never stores a context longer than $max words.
    my $longest = @$output < $max ? scalar(@$output) : $max;

    for (my $len = $longest; $len >= 1; --$len) {
        my $tuple = join(' ', @{$output}[$#{$output} - $len + 1 .. $#{$output}]);
        my $choices = $tuples{$tuple};
        next unless defined($choices);
        next if ($extra && ($len > 1) && (int(rand($len)) == 0));
        return pick_weighted($choices);
    }
    return;
}

# pick_weighted(\%count_of)
#
# Returns one key of the hash, chosen at random with probability
# proportional to its count.
sub pick_weighted {
    my ($choices) = @_;

    my $total = 0;
    $total += $_ foreach values %$choices;

    my $r = int(rand($total));
    foreach my $choice (keys %$choices) {
        my $weight = $choices->{$choice};
        return $choice if ($r < $weight);
        $r -= $weight;
    }
    return;
}

# add_tuples($word, @latest)
#
# Records in %tuples that $word followed each suffix of @latest: the last
# word alone, the last two words, and so on up to the whole list.
sub add_tuples {
    my ($word, @latest) = @_;

    foreach my $i (0 .. $#latest) {
        my $tuple = join(' ', @latest[$#latest - $i .. $#latest]);
        ++$tuples{$tuple}{$word};
    }
    return;
}