# split text file into individual words
# - call from pipe

use strict;
use warnings;

# ---------- ---------- ---------- ----------

# Decode input and encode output as UTF-8 so that the substitutions below
# operate on whole characters.  Matching raw bytes instead would blank
# individual bytes of multi-byte characters and cut accented letters in half.
# Input is expected to be UTF-8.
binmode(STDIN,  ':encoding(UTF-8)');
binmode(STDOUT, ':encoding(UTF-8)');

# read all text from the pipe; every line already carries its own newline,
# so the lines are concatenated without an extra separator
my $theFile = join('', <STDIN>);

# print one word per line
foreach my $theWord (split_words($theFile)) {
  print("$theWord\n");
}

# split_words(TEXT)
#
# Takes a string of (decoded) text and returns the list of words in it:
#   - a pair of single quotes ('') is treated as a double quote
#   - punctuation, digits and a handful of typographic symbols are
#     replaced by spaces, so they also act as word separators
#   - a ' or & at the edge of a word is dropped
#   - words still containing ^ or & are discarded
# Returns an empty list for undefined or empty input.
sub split_words {
  my ($text) = @_;
  return () unless defined $text;

  # convert '' pairs into "
  $text =~ s/\'\'/\"/g;

  # remove punctuation, numbers, strange characters.
  # The non-ASCII symbols are written as code points so the pattern does not
  # depend on the encoding of this source file:
  #   U+00B0 degree sign, U+00A3 pound sign, U+2014 em dash, U+2022 bullet,
  #   U+201C/U+201D curly double quotes, U+2026 ellipsis, U+201E low quote
  $text =~ s/[\?\;\:\!\,\.\"\(\)\$\%\*\\\/\+\-\=\<\>\[\]\_\`\|\{\}0-9~\x{00B0}\x{00A3}\x{2014}\x{2022}\x{201C}\x{201D}\x{2026}\x{201E}]/ /g;

  # Hyphens, backticks and digits are all blanked by the substitution above,
  # so page markers such as "- 12 -" and hyphenated words need no separate
  # handling here.

  # remove ' and & at the edge of a word; the start and end of the text count
  # as edges too, so the first and last word are treated like all the others
  $text =~ s/(?:\A|\s)[\'\&]/ /g;
  $text =~ s/[\'\&](?:\s|\z)/ /g;

  # split back on whitespace, dropping words with ^ or & in them
  return grep { !/[\^\&]/ } split(" ", $text);
}


