#!/usr/bin/perl -w use strict; my($help) = "$0:\nA program to do n-letter frequency analysis on a specified input,\n". "and generate pseudo-words based on the observed frequency.\nFlags " . "are (with defaults given in parenthesis):\n\t" . "-h: print this help\n\t" . "-q: suppress all output except final results\n\t" . "-l: translate text to lower-case first\n\t" . "-g: granularity [word or character] (defaults to character)\n\t" . "-n: number of blocks to generate (100)\n\t" . "-p: number of blocks to use to predict next block (2)\n\t" . "-c: number of grains per block (2)\n\t". "-i: input from (stdin)\n\t" . "-o: output to (stdout)\n"; die $help if (!@ARGV); my($quiet) = 0; my($lowercase) = 0; my($numgen) = 100; my($blocks) = 2; my($width) = 2; my($input) = '-'; my($output) = '-'; my($gran) = 'c'; my($i, $j) = 0; for ($i = 0; $i <= $#ARGV; $i++) { if ($ARGV[$i] eq '-q') { $quiet = 1; } elsif ($ARGV[$i] eq '-l') { $lowercase = 1; } elsif ($ARGV[$i] eq '-n' && $i < $#ARGV) { $numgen = int($ARGV[++$i]); ($numgen > 0) || die "Number of blocks to output must be at least 1!\n"; } elsif ($ARGV[$i] eq '-p' && $i < $#ARGV) { $blocks = int($ARGV[++$i]); ($blocks > -1) || die "Number of prediction blocks must be non-negative!\n"; } elsif ($ARGV[$i] eq '-c' && $i < $#ARGV) { $width = int($ARGV[++$i]); ($width > 0) || die "Characters per block must be at least 1!\n"; } elsif ($ARGV[$i] eq '-g' && $i < $#ARGV) { $gran = lc(substr($ARGV[++$i], 0, 1)); if ($gran ne 'c' && $gran ne 'w') { die "Granularity must be either w(ord) or c(haracter).\n"; } } elsif ($ARGV[$i] eq '-i' && $i < $#ARGV) { $input = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-o' && $i < $#ARGV) { $output = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '-help') { die $help; } else { die $help; } } if (!$quiet) { print "grain type: "; if ($gran eq 'c') { print "character\n"; } else { print "word\n"; } print "grains per block: $width\n"; print "# blocks to predict with: $blocks\n"; print "# blocks to generate: $numgen\n"; print "input file: $input\n"; print "output file: $output\n"; } open(INPUT, "<$input") || die "Can't open $input; $!\n"; if (!$quiet) { print "Reading in from '$input' ...\n"; } undef $/; my($text) = ; close INPUT; open(OUTPUT, ">$output") || die "Can't open $output; $!\n"; if (!$quiet) { print "Substituting ...\n"; } $text =~ s/[\n\t\r ]+/ /gs; $text =~ s/^ //s; $text =~ s/ $//s; if ($lowercase) { $text = lc $text; } my @grainList; my $spacing = ""; # spacing between grains: either a space or nothing if ($gran eq 'c') { @grainList = split(//, $text); $spacing = ""; } else { @grainList = split(/[^a-zA-Z0-9'_-]+/, $text); $spacing = " "; } undef $text; # maybe this'll free up some space, I dunno. die "Input text too short!" if ($#grainList < ($blocks + 1)*$width); if (!$quiet) { print "Scanning in characters ...\n"; } # @predictGrains is our "working set" of grains, while @curGrains is # the ones the working set will predict my @predictGrains; my @curGrains; # prepare by moving the first group of grains over foreach (0..($blocks*$width)-1) { push @predictGrains, shift @grainList; } foreach (0..$width-1) { push @curGrains, shift @grainList; } my %probabilities; # ok, now that we've prepared the initial, repeatedly: # - store the probability for the current @predictGrains and @curGrains # - shift a grain from @grainList to @curGrains # - shift a grain from @curGrains to @predictGrains # - shift a grain from @predictGrains to nil # (until @grainList is empty) while (@grainList) { &storeProb(\%probabilities, \@predictGrains, \@curGrains, $width, $spacing); push(@curGrains, shift @grainList); push(@predictGrains, shift @curGrains); shift @predictGrains; } if (!$quiet) { print "Normalizing probabilities ...\n"; } &normalizeProbs(\%probabilities, $blocks); if (!$quiet) { print "Generating text ...\n"; } my($r); srand(time ^ $$ ^ ($$<<15)); my @seed = &generateSeed(\%probabilities, $blocks); if (!$quiet) { print "Starting seed: ", join(($spacing . " "), @seed), "\n"; print "\n\n"; } my $results = ""; my $tok; # Now just go ahead and predict from the seed, then shove the predicted # block onto the end to add to the new seed. foreach my $i (1..$numgen-$blocks+1) { $tok = &makePrediction(\%probabilities, @seed); $results .= $tok . $spacing; push @seed, $tok; shift @seed; } # finally, print out the results # please remember to wordwrap thankyou. my ($start, $index, $splitPoint) = (0, 0, 75); while (length($results) > $splitPoint) { $index = rindex($results, ' ', $splitPoint); $start = substr($results, 0, $index); $results = substr($results, $index+1); print OUTPUT $start, "\n"; } print OUTPUT "\n"; ############## # Subroutines ############## sub storeProb { my($probHash, $predictList, $curList, $width, $spacing) = @_; my($i, $key); # ok, now we need to wander down the hashes using the blocks as keys for ($i = 0; $i <= $#{$predictList}; $i += $width) { $key = ""; foreach ($i..$i+$width-2) { $key .= $predictList->[$_] . $spacing; } $key .= $predictList->[$i+$width-1]; # step deeper into the hash on this key, making a new hash # if necessary. $probHash->{$key} = +{} unless defined $probHash->{$key}; $probHash = $probHash->{$key}; } $key = ""; foreach (0..$width-2) { $key .= $curList->[$_] . $spacing; } $key .= $curList->[$width-1]; # increase the value stored there or set it to 1 $probHash->{$key} = 0 unless defined $probHash->{$key}; $probHash->{$key}++; } sub normalizeProbs { my ($probHash, $blocks) = @_; my (@list1, @list2); push(@list1, $probHash); foreach (0..$blocks-1) { foreach my $ref (@list1) { push(@list2, values(%{$ref})) } @list1 = @list2; # swap values $#list2 = -1; # clear list2 } # at this point @list1 should be a list of all the bottommost hashes in # the tree my $sum = 0; foreach my $ref (@list1) { $sum = 0; foreach my $i (values %{$ref}) { $sum += $i; } foreach my $i (keys %{$ref}) { # normalize each prob based on $sum $ref->{$i} = int ($ref->{$i} * 10000 / $sum); } } } sub generateSeed { my ($probHash, $blocks) = @_; my @ret; my($i, @tags); # ok, now we need to wander down the hashes using the blocks as keys foreach (0..$blocks-1) { @tags = keys(%{$probHash}); $i = rand @tags; push @ret, $tags[$i]; $probHash = $probHash->{ $tags[$i] }; } return @ret; } sub makePrediction { my($probHash, @seed) = @_; # ok, now we need to wander down the hashes using the blocks as keys foreach my $key (@seed) { if (defined( $probHash->{$key} )) { $probHash = $probHash->{$key}; } else { # we haven't encounted this situation before. hmm # I know, let's pick randomly! yay randomness! my @tags = keys(%{$probHash}); $probHash = $probHash->{$tags[rand @tags]}; } } # we are now at the base hash. # randomly select a result based on their weightings. my $choice = int rand(9999); my @keylist = keys(%{$probHash}); foreach my $key (@keylist) { $choice -= $probHash->{$key}; return $key if ($choice <= 0); } # hmm, got here if (!$quiet) { print "Finished with choice = $choice?\n"; } return $keylist[0]; }