diff options
11 files changed, 2967 insertions, 0 deletions
diff --git a/spamassassin/fuzzyocr/ b/spamassassin/fuzzyocr/
new file mode 100644
index 0000000..c750384
--- /dev/null
+++ b/spamassassin/fuzzyocr/
@@ -0,0 +1,124 @@
+loadplugin FuzzyOcr /etc/spamassassin/FuzzyOcr.pm
+body FUZZY_OCR eval:fuzzyocr_check()
+describe FUZZY_OCR Mail contains an image with common spam text inside
+body FUZZY_OCR_WRONG_CTYPE eval:dummy_check()
+describe FUZZY_OCR_WRONG_CTYPE Mail contains an image with wrong content-type set
+body FUZZY_OCR_CORRUPT_IMG eval:dummy_check()
+describe FUZZY_OCR_CORRUPT_IMG Mail contains a corrupted image
+body FUZZY_OCR_KNOWN_HASH eval:dummy_check()
+describe FUZZY_OCR_KNOWN_HASH Mail contains an image with known hash
+priority FUZZY_OCR 900
+########### Plugin Configuration #############
+#### Logging options #####
+# Verbosity level (see manual) Attention: Don't set to 0, but to 0.0 for quiet operation. (Default value: 1)
+#focr_verbose 1
+#focr_verbose 1
+focr_verbose 2
+# Logfile (make sure it is writable by the plugin) (Default value: /etc/mail/spamassassin/FuzzyOcr.log)
+# obsoleted by weasel
+#focr_logfile /var/lib/FuzzyOcr/log
+##### Wordlists #####
+# Here we defined the words to scan for (Default value: /etc/mail/spamassassin/FuzzyOcr.words)
+focr_global_wordlist /etc/spamassassin/FuzzyOcr.words
+# This is the path RELATIVE to the respective home directory for the personalized list
+# This list is merged with the global word list on execution (Default value: .spamassassin/fuzzyocr.words)
+#focr_personal_wordlist .spamassassin/fuzzyocr.words
+# Set this to 1 if you are running a version < 3.1.4.
+# This will disable a function used in conjunction with animated gifs that isn't available in earlier versions (Default value: 0.0)
+#focr_pre314 0.0
+# These parameters can be used to change other detection settings
+# If you leave these commented out, the defaults will be used.
+# Do not use " " around any parameters!
+##### Location of helper applications (path + binary) (Default values: /usr/bin/<app>) #####
+#focr_bin_giffix /usr/bin/giffix
+#focr_bin_giftext /usr/bin/giftext
+#focr_bin_gifasm /usr/bin/gifasm
+#focr_bin_gifinter /usr/bin/gifinter
+#focr_bin_giftopnm /usr/bin/giftopnm
+#focr_bin_jpegtopnm /usr/bin/jpegtopnm
+#focr_bin_pngtopnm /usr/bin/pngtopnm
+#focr_bin_ppmhist /usr/bin/ppmhist
+#focr_bin_convert /usr/bin/convert
+#focr_bin_identify /usr/bin/identify
+#focr_bin_gocr /usr/bin/gocr
+##### Scansets, comma separated (Default value: $gocr -i -, $gocr -l 180 -d 2 -i -) #####
+# Each scanset consists of one or more commands which make text out of pnm input.
+# Each scanset is run separately on the PNM data, results are combined in scoring.
+#focr_scansets $gocr -i -, $gocr -l 180 -d 2 -i -
+# To use only one scan with default values, uncomment the next line instead
+#focr_scansets $gocr -i -
+# Some example for more advanced sets
+# This one first uses the standard scan, then a scanset which first reduces the image to 3 colors and then scans it with custom settings
+# and then it scans again only with these custom settings
+# NOTE: This is for advanced users only, if you have questions how to use this, ask on the ML or on IRC
+#focr_scansets $gocr -i -, pnmnorm 2>$errfile | pnmquant 3 2>>$errfile | pnmnorm 2>>$errfile | $gocr -l 180 -d 2 -i -, $gocr -l 180 -d 2 -i -
+##### Various Score/Scan settings #####
+# Timeout for the plugin, in seconds. (Maximum runtime of the plugin) (Default value: 10)
+#focr_timeout 10
+focr_timeout 30
+# Default detection threshold (see manual) (Default value: 0.3) (Can be changed on a per word basis in the wordlist).
+#focr_threshold 0.3
+# This is the score for a hit after focr_counts_required matches (Default value: 4)
+#focr_base_score 4
+focr_base_score 1
+# This is the additional score for every additional match after focr_counts_required matches (Default value: 1)
+#focr_add_score 1
+focr_add_score 0.1
+# This is the score to give for a wrong content-type (e.g. JPEG image but content type says GIF) (Default value: 1.5)
+#focr_wrongctype_score 1.5
+focr_wrongctype_score 0.5
+# This is the score to give for a corrupted image (This currently affects only GIF images) (Default value: 2.5)
+#focr_corrupt_score 2.5
+focr_corrupt_score 0.5
+# This is the score to give for a corrupted unfixable image (This currently affects only GIF images) (Default value: 5)
+#focr_corrupt_unfixable_score 5
+focr_corrupt_unfixable_score 0.5
+# This is used to disable the OCR engine if the message has already more points than this value (Default value: 10)
+#focr_autodisable_score 10
+# Number of minimum matches before the rule scores (Default value: 2)
+#focr_counts_required 2
+# Specifies, how many frames an animated gif must contain, so the second (less resource consuming) animated gif test is used. (Default value: 5)
+#focr_gif_max_frames 5
+##### Image Hash Database settings (Experimental, disabled by default) #####
+# Set this to 1 to enable the Image Hash database feature (Default value: 0.0)
+#focr_enable_image_hashing 0.0
+# The score is saved with the hash in the database, so no extra scoring for a db hit is required.
+# If the image hash database feature is enabled, specify the file here to use as database (Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb)
+#focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb
+# Automatically add hashes of spam images recognized by OCR to the Image Hash database, to disable, set to 0.0 (Default value: 1)
+#focr_hashing_learn_scanned 1
diff --git a/spamassassin/fuzzyocr/ b/spamassassin/fuzzyocr/
new file mode 100644
index 0000000..5f5f01b
--- /dev/null
+++ b/spamassassin/fuzzyocr/
@@ -0,0 +1,864 @@
+# FuzzyOcr plugin, version 2.3b
+# Changelog:
+# version 2.0
+# Replaced imagemagick with netpbm
+# Invoke giffix to fix broken gifs before conversion
+# Support png images
+# Analyze the file to detect the format without content-type
+# Added several configuration parameters
+# version 2.1
+# Added scoring for wrong content-type
+# Added scoring for broken gif images
+# Added configuration for helper applications
+# Added autodisable_score feature to disable the OCR engine if the message has already enough points
+# version 2.1b
+# Rule bugfix to avoid warnings
+# version 2.1c
+# Applied patch provided by Howard Kash to fix problems with spamassassin + Mailscanner + FuzzyOcr
+# Removed '-' from jpegtopnm arguments to provide backwards compatibility for older netpbm versions
+# Fixed typo (treshold -> threshold)
+# version 2.2
+# Small bugfix in content-type check for jpeg (jpg was not matching), thanks to Matthias Keller
+# Added more error handling
+# Removed debug files, added logfile instead
+# More messages with verbose = 2
+# version 2.3
+# Multiple scans with different pnm preprocessing and gocr arguments possible
+# Support for interlaced gifs
+# Support for animated gifs
+# Temporary file handling reorganized
+# External wordlist support
+# Personalized wordlist support
+# Spaces are now stripped from wordlist words and OCR results before matching
+# Experimental MD5 Database feature
+# version 2.3b
+# MD5 Database replaced by different feature database
+# Corrupted images are now handled better
+# Added a timeout function to avoid lockups
+# Added threshold overriding on word basis in wordlist
+# Various bugfixes
+# written by Christian Holler decoder_at_own-hero_dot_net
+# syslog support added by weasel, 2006-09-28
+package FuzzyOcr;
+use strict;
+use warnings;
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Util;
+use Mail::SpamAssassin::Plugin;
+use Sys::Syslog;
+use String::Approx 'adistr';
+use FileHandle;
+use Fcntl ':flock';
+our @ISA = qw (Mail::SpamAssassin::Plugin);
+our @err_msges = (    # sprintf templates for handle_error(); callers pick them by index
+ "Failed to open pipe to external programs with pipe command \"%s\".
+Please check that all helper programs are installed and in the correct path.
+(Pipe Command \"%s\", Pipe exit code %d (\"%s\"), Temporary file: \"%s\")",    # [0] five placeholders
+ "Unexpected error in pipe to external programs.
+Please check that all helper programs are installed and in the correct path.
+(Pipe Command \"%s\", Pipe exit code %d (\"%s\"), Temporary file: \"%s\")",    # [1] four placeholders
+ "Cannot open \"%s\" to read previously produced data!
+(Previously used pipe: \"%s\", error code %d (\"%s\"), Temporary file: \"%s\")",    # [2] five placeholders
+ "Unexpected error while trying executing gocr with arguments \"%s\".
+Make sure the gocr location is specified correctly and the arguments are correct.",    # [3] one placeholder
+ "Failed to open global wordlist \"%s\" for reading.
+Please check that path and permissions are correct."    # [4] one placeholder
+our @words = ();    # word list filled by load_global_words() / load_personal_words()
+our $self;    # first argument of fuzzyocr_check(), stored globally
+our $pms;    # second argument of fuzzyocr_check(); used to read the message and report hits
+# Default values
+our $threshold = "0.3";    # focr_threshold
+our $base_score = "4";    # focr_base_score
+our $add_score = "1";    # focr_add_score
+our $wctypescore = "1.5";    # focr_wrongctype_score
+our $cimgscore = "2.5";    # focr_corrupt_score
+our $cimgscore2 = "5";    # focr_corrupt_unfixable_score
+our $countreq = 2;    # focr_counts_required
+our $verbose = 1;    # focr_verbose; debuglog() only logs when this is > 1
+our $timeout = 10;    # focr_timeout, seconds handed to Mail::SpamAssassin::Timeout
+our $pre314 = 0;    # focr_pre314
+our $enable_image_hashing = 0;    # focr_enable_image_hashing
+our $hashing_learn_scanned = 1;    # focr_hashing_learn_scanned
+our ($ts, $th, $tw, $tcn, $tc, $hash_ccnt) = (0.01, 0.01, 0.01, 0.01, 5, 5);    # check_image_hash_db() tolerances: first four relative, $tc absolute per colour field; $hash_ccnt caps histogram lines in calc_image_hash(); none are configurable via parse_config()
+our $giffix = "/usr/bin/giffix";    # focr_bin_giffix
+our $giftext = "/usr/bin/giftext";    # focr_bin_giftext
+our $gifasm = "/usr/bin/gifasm";    # focr_bin_gifasm
+our $gifinter = "/usr/bin/gifinter";    # focr_bin_gifinter
+our $giftopnm = "/usr/bin/giftopnm";    # focr_bin_giftopnm
+our $jpegtopnm = "/usr/bin/jpegtopnm";    # focr_bin_jpegtopnm
+our $pngtopnm = "/usr/bin/pngtopnm";    # focr_bin_pngtopnm
+our $pnmfile = "/usr/bin/pnmfile";    # NOTE(review): no config key and no use in the visible code
+our $ppmhist = "/usr/bin/ppmhist";    # focr_bin_ppmhist
+our $convert = "/usr/bin/convert";    # focr_bin_convert
+our $identify = "/usr/bin/identify";    # focr_bin_identify
+our $gocr = "/usr/bin/gocr";    # focr_bin_gocr; substituted for the literal $gocr in scansets
+our $grep = "/bin/grep";    # focr_bin_grep
+our $max_images = 5;    # focr_gif_max_frames
+our $dscore = 10;    # focr_autodisable_score
+our $logfile = "/etc/mail/spamassassin/FuzzyOcr.log";    # focr_logfile; logfile() in this version writes to syslog instead
+our $pwordlist = ".spamassassin/fuzzyocr.words";    # focr_personal_wordlist, relative to the home directory
+our $digest_db = "/etc/mail/spamassassin/FuzzyOcr.hashdb";    # focr_digest_db
+our @scansets = (    # default OCR command lines; replaced by parse_scansets()
+ '$gocr -i -',
+ '$gocr -l 180 -d 2 -i -'
+# Constructor: creates the plugin object through the SpamAssassin base class
+# and registers both eval rules used by the rule definitions.
+sub new {
+    my $class  = shift;
+    my $mailsa = shift;
+    $class = ref($class) || $class;
+
+    my $self = $class->SUPER::new($mailsa);
+    bless( $self, $class );
+
+    foreach my $rule ( "fuzzyocr_check", "dummy_check" ) {
+        $self->register_eval_rule($rule);
+    }
+    return $self;
+# Plugin hook called by SpamAssassin for each configuration line.
+#
+#   $opts->{key}    name of the setting
+#   $opts->{value}  its value as written in the .cf file
+#
+# Returns 1 (and inhibits further callbacks) when the key is one of the
+# focr_* settings handled here, 0 otherwise so another plugin can claim it.
+# The explicit return matters: without it the result was whatever the last
+# assignment evaluated to, so a handled setting with the value "0" looked
+# unhandled to the caller.
+#
+# Numeric settings are stored as numbers, so "0" and "0.0" both disable a
+# feature (the string "0.0" is true in boolean context).
+sub parse_config {
+    my ( $self, $opts ) = @_;
+    my $key   = $opts->{key};
+    my $value = $opts->{value};
+    return 0 unless defined $key;
+
+    # Settings stored as numbers.
+    my %numeric = (
+        focr_threshold               => \$threshold,
+        focr_base_score              => \$base_score,
+        focr_add_score               => \$add_score,
+        focr_corrupt_score           => \$cimgscore,
+        focr_corrupt_unfixable_score => \$cimgscore2,
+        focr_wrongctype_score        => \$wctypescore,
+        focr_counts_required         => \$countreq,
+        focr_verbose                 => \$verbose,
+        focr_timeout                 => \$timeout,
+        focr_pre314                  => \$pre314,
+        focr_gif_max_frames          => \$max_images,
+        focr_autodisable_score       => \$dscore,
+        focr_enable_image_hashing    => \$enable_image_hashing,
+        focr_hashing_learn_scanned   => \$hashing_learn_scanned,
+    );
+
+    # Settings stored verbatim (paths).
+    my %string = (
+        focr_personal_wordlist => \$pwordlist,
+        focr_bin_giffix        => \$giffix,
+        focr_bin_giftext       => \$giftext,
+        focr_bin_gifasm        => \$gifasm,
+        focr_bin_gifinter      => \$gifinter,
+        focr_bin_giftopnm      => \$giftopnm,
+        focr_bin_jpegtopnm     => \$jpegtopnm,
+        focr_bin_pngtopnm      => \$pngtopnm,
+        focr_bin_ppmhist       => \$ppmhist,
+        focr_bin_convert       => \$convert,
+        focr_bin_identify      => \$identify,
+        focr_bin_gocr          => \$gocr,
+        focr_bin_grep          => \$grep,
+        focr_digest_db         => \$digest_db,
+        focr_logfile           => \$logfile,
+    );
+
+    if ( $key eq "focr_global_wordlist" ) {
+        load_global_words($value);
+    }
+    elsif ( $key eq "focr_scansets" ) {
+        parse_scansets($value);
+    }
+    elsif ( my $target = $numeric{$key} ) {
+        # Only numify values that look like numbers; anything else is
+        # stored unchanged, exactly as before.
+        if ( defined $value and $value =~ /^\s*-?(?:\d+\.?\d*|\.\d+)\s*$/ ) {
+            $$target = $value + 0;
+        }
+        else {
+            $$target = $value;
+        }
+    }
+    elsif ( $target = $string{$key} ) {
+        $$target = $value;
+    }
+    else {
+        return 0;
+    }
+
+    $self->inhibit_further_callbacks();
+    return 1;
+# Eval stub behind the FUZZY_OCR_WRONG_CTYPE / _CORRUPT_IMG / _KNOWN_HASH
+# rules. It never matches; those rules are reported from the OCR check.
+sub dummy_check {
+    my $no_hit = 0;
+    return $no_hit;
+# Eval rule entry point. Stores the plugin and message objects in the
+# package globals, runs check_fuzzy_ocr() under a $timeout second limit
+# and logs when that limit was hit. Always returns 0; hits are reported
+# from inside check_fuzzy_ocr().
+sub fuzzyocr_check {
+    ( $self, $pms ) = @_;
+
+    my $timer = Mail::SpamAssassin::Timeout->new( { secs => $timeout } );
+    $timer->run( \&check_fuzzy_ocr );
+
+    logfile("FuzzyOcr received timeout after running \"$timeout\" seconds.")
+        if $timer->timed_out();
+
+    return 0;
+# Loads the global word list into @words.
+#
+#   $_[0]  path of the word list file
+#
+# Whole-line comments are skipped, trailing "# ..." comments are removed,
+# and lines without any letter are ignored (words are reduced to letters
+# before matching, so such a line would become an empty pattern).
+# Returns 1 on success, nothing when the file cannot be read.
+sub load_global_words {
+    my $path = $_[0];
+
+    # $err_msges[4] is the "Failed to open global wordlist" template.
+    unless ( defined $path and -r $path ) {
+        handle_error( $err_msges[4], ( defined $path ? $path : '' ) );
+        return;
+    }
+
+    my $fh;
+    unless ( open( $fh, '<', $path ) ) {
+        handle_error( $err_msges[4], ($path) );
+        return;
+    }
+
+    # Lexical loop variable: this runs during config parsing and must not
+    # clobber the caller's $_.
+    while ( my $line = <$fh> ) {
+        chomp($line);
+        next if $line =~ /^[ \t]*#.*$/;
+        $line =~ s/[ \t]*#.*$//;
+        next unless $line =~ /[a-zA-Z]/;
+        push( @words, $line );
+    }
+    close($fh);
+    return 1;
+# Merges a per-user word list into @words.
+#
+#   $_[0]  path of the personal word list file
+#
+# A missing or unreadable file is only reported through debuglog().
+# Lines already present verbatim in @words are skipped, so calling this
+# once per message does not keep growing the list. Lines without any
+# letter are ignored because they would become empty patterns.
+# Returns 1 when the file was read, nothing otherwise.
+sub load_personal_words {
+    my $path = $_[0];
+
+    unless ( defined $path and -e $path ) {
+        debuglog("No personal wordlist found, skipping...");
+        return;
+    }
+
+    my $fh;
+    unless ( -r $path and open( $fh, '<', $path ) ) {
+        debuglog(
+"Unable to read from wordlist \"$path\", please make sure that permissions are correct."
+        );
+        return;
+    }
+
+    my %known = map { $_ => 1 } @words;
+    while ( my $line = <$fh> ) {
+        chomp($line);
+        next if $line =~ /^[ \t]*#.*$/;
+        $line =~ s/[ \t]*#.*$//;
+        next unless $line =~ /[a-zA-Z]/;
+        next if $known{$line}++;
+        push( @words, $line );
+    }
+    close($fh);
+    return 1;
+# Replaces @scansets with the comma separated command lines in $_[0].
+# Spaces following a comma are dropped. The argument is copied first so
+# the caller's value is not rewritten through the @_ alias, and empty
+# entries are discarded because they would run an empty pipe command.
+# Always returns 1.
+sub parse_scansets {
+    my $spec = defined $_[0] ? $_[0] : '';
+    @scansets = grep { length } split( /,[ ]*/, $spec );
+    debuglog( "Set scansets to values:\n" . join( "\n", @scansets ) );
+    return 1;
+# Returns the larger of two numbers.
+# An undefined argument is ignored in favour of the other one; 0 is
+# returned only when both are undefined. (The first guard used "and",
+# which returned 0 as soon as either argument was undefined and made the
+# two fallbacks below it unreachable.)
+sub max {
+    my ( $first, $second ) = @_;
+    return 0 unless ( defined($first) or defined($second) );
+    return $second unless defined($first);
+    return $first  unless defined($second);
+    return ( $first < $second ) ? $second : $first;
+# Concatenates all arguments into one string and splits the result again
+# at every newline, returning the resulting list of lines.
+sub reorder {
+    my $joined = join( '', @_ );
+    my @lines = split( '\n', $joined );
+    return @lines;
+# Feeds $input to a shell pipeline and collects what it produced.
+#
+#   pipe_io( $pipecmd, $input [, $silent [, $ignerror ]] )
+#       stdout/stderr of the pipeline are captured in temporary files.
+#       $silent   suppresses error logging
+#       $ignerror continues even if the pipeline exits non-zero
+#   pipe_io( $pipecmd_containing_tmpdir, $input, $filecount )
+#       the literal "$tmpdir" in the command is replaced by a fresh
+#       temporary directory; the largest of the files out00.gif ...
+#       written there is returned as stdout.
+#
+# The literal "$errfile" in the command is replaced by the stderr
+# capture file. Returns ( $rcode, \@stdout, \@stderr ); $rcode is 0 on
+# success and non-zero on every failure.
+sub pipe_io {
+    my $pipecmd   = shift;
+    my $input     = shift;
+    my $filecount = 0;
+    my $silent    = 0;
+    my $ignerror  = 0;
+    my $tmpdir;
+    my @stdout = ();
+    my @stderr = ();
+
+    # NOTE(review): $tmpfile/$errfile are used as file names below, so
+    # $tfilepath/$efilepath presumably hold the open handles - confirm the
+    # return order of secure_tmpfile() and which variable close() needs.
+    my ( $tmpfile, $tfilepath ) = Mail::SpamAssassin::Util::secure_tmpfile();
+    my ( $errfile, $efilepath ) = Mail::SpamAssassin::Util::secure_tmpfile();
+    close($tmpfile);
+    close($errfile);
+
+    if ( $tmpfile eq $errfile ) {
+        debuglog("Got same tmpfile twice! Aborting pipe_io() to avoid deadlocking");
+        # Remove the file before returning (this used to sit after the
+        # return and never ran).
+        unlink($tmpfile);
+        return ( 1, \@stdout, \@stderr );
+    }
+
+    if ( $pipecmd =~ /\$tmpdir/ ) {
+        $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir();
+        $pipecmd =~ s/\$tmpdir/$tmpdir/g;
+        $filecount = shift;
+    }
+    else {
+        $silent   = shift;
+        $ignerror = shift;
+    }
+    $pipecmd =~ s/\$errfile/$errfile/g;
+
+    # Name of frame $n inside $tmpdir: out00.gif, out01.gif, ...
+    my $frame = sub { sprintf( "%s/out%02d.gif", $tmpdir, $_[0] ) };
+
+    # Removes every temporary artefact of this call, on all exit paths.
+    my $cleanup = sub {
+        unlink($tmpfile);
+        unlink($errfile);
+        if ( defined $tmpdir ) {
+            unlink( map { $frame->($_) } 0 .. ( $filecount || 0 ) );
+            rmdir($tmpdir);
+        }
+    };
+
+    my $pipe_pid = open( PIPE_IN, "| $pipecmd 1>$tmpfile 2>>$errfile" );
+    unless ($pipe_pid) {
+        my $status = $?;
+        unless ($silent) {
+            # Template 0 has five placeholders; the command appears twice.
+            handle_error( $err_msges[0],
+                ( $pipecmd, $pipecmd, $status >> 8, $!, $tmpfile ) );
+        }
+        $cleanup->();
+        return ( $status || 1, \@stdout, \@stderr );
+    }
+    flock( PIPE_IN, LOCK_EX );
+    print PIPE_IN $input;
+    flock( PIPE_IN, LOCK_UN );
+    close(PIPE_IN);
+
+    if ( $? and not $ignerror ) {
+        my $status = $?;
+        unless ($silent) {
+            handle_error( $err_msges[1], ( $pipecmd, $status >> 8, $!, $tmpfile ) );
+        }
+        $cleanup->();
+        return ( $status, \@stdout, \@stderr );
+    }
+
+    if ($filecount) {
+        # Pick the largest frame; missing or empty files count as size 0.
+        my ( $best, $best_size ) = ( 0, 0 );
+        foreach my $nr ( 0 .. $filecount - 1 ) {
+            my $size = ( -s $frame->($nr) ) || 0;
+            if ( $size > $best_size ) {
+                $best_size = $size;
+                $best      = $nr;
+            }
+        }
+        unless ( open( PIPE_OUT, '<', $frame->($best) ) ) {
+            unless ($silent) {
+                handle_error( $err_msges[2],
+                    ( $frame->($best), $pipecmd, $? >> 8, $!, $tmpfile ) );
+            }
+            $cleanup->();
+            return ( $? || 1, \@stdout, \@stderr );
+        }
+        flock( PIPE_OUT, LOCK_EX );
+        @stdout = <PIPE_OUT>;
+        flock( PIPE_OUT, LOCK_UN );
+        close PIPE_OUT;
+        $cleanup->();
+        return ( 0, \@stdout, \@stderr );
+    }
+
+    unless ( open( PIPE_OUT, '<', $tmpfile ) ) {
+        unless ($silent) {
+            handle_error( $err_msges[2],
+                ( $tmpfile, $pipecmd, $? >> 8, $!, $tmpfile ) );
+        }
+        $cleanup->();
+        return ( $? || 1, \@stdout, \@stderr );
+    }
+    unless ( open( PIPE_ERR, '<', $errfile ) ) {
+        close(PIPE_OUT);
+        unless ($silent) {
+            handle_error( $err_msges[2],
+                ( $errfile, $pipecmd, $? >> 8, $!, $tmpfile ) );
+        }
+        $cleanup->();
+        return ( $? || 1, \@stdout, \@stderr );
+    }
+    flock( PIPE_OUT, LOCK_EX );
+    flock( PIPE_ERR, LOCK_EX );
+    @stdout = <PIPE_OUT>;
+    @stderr = <PIPE_ERR>;
+    flock( PIPE_OUT, LOCK_UN );
+    flock( PIPE_ERR, LOCK_UN );
+    close(PIPE_OUT);
+    close(PIPE_ERR);
+    $cleanup->();
+    return ( 0, \@stdout, \@stderr );
+# Formats an error template (first argument, sprintf syntax) with the
+# remaining arguments and hands the result to logfile().
+sub handle_error {
+    my $template = shift;
+    my $message  = sprintf( $template, @_ );
+    logfile($message);
+# Writes $_[0] to syslog (ident 'SA-FuzzyOCR', facility 'mail', priority
+# 'info'), one syslog entry per line of text.
+#
+# The text goes through an explicit '%s' format: log messages contain
+# data taken from the scanned mail (e.g. the content-type), and syslog()
+# treats its second argument as a sprintf format string.
+# The timestamp that was only needed by the retired file logger is no
+# longer computed; syslog adds its own.
+sub logfile {
+    my $logtext = defined $_[0] ? $_[0] : '';
+    openlog 'SA-FuzzyOCR', 'pid', 'mail';
+    for my $line ( split /\n/, $logtext ) {
+        syslog 'info', '%s', $line;
+    }
+    closelog;
+# Looks up an image digest in the hash database ($digest_db).
+#
+#   $_[0]  digest as built by calc_image_hash():
+#          "size:dim:dim:colours" followed by "::r:g:b:l:c" groups
+#
+# A record matches when its four leading values are within the relative
+# tolerances $ts/$th/$tw/$tcn and every colour field is within the
+# absolute tolerance $tc. Returns the score stored with the first
+# matching record, or 0 when nothing matches, the database cannot be
+# opened, or a record has a different number of colour fields.
+sub check_image_hash_db {
+    my $digest = $_[0];
+    my ( $gpf, @gcf ) = split( '::', $digest );
+    my ( $gs, $gh, $gw, $gcn ) =
+      map { ( defined $_ and length $_ ) ? $_ : 0 } ( split( ':', $gpf ) )[ 0 .. 3 ];
+    my @gcfs = map { split( ':', $_ ) } @gcf;
+
+    # Relative difference that tolerates a zero reference value instead
+    # of dying with a division by zero.
+    my $rel = sub {
+        my ( $db, $given ) = @_;
+        return abs( $db - $given ) / $given if $given;
+        return $db ? 1e9 : 0;
+    };
+
+    unless ( open( DB, '<', $digest_db ) ) {
+        debuglog("No Image Hash database found at \"$digest_db\", or permissions wrong.");
+        return 0;
+    }
+
+    my $result = 0;
+  RECORD:
+    while ( my $record = <DB> ) {
+        chomp($record);
+        my ( $score, $dpf, @dcf ) = split( '::', $record );
+        next RECORD unless defined $dpf;
+        my ( $ds, $dh, $dw, $dcn ) =
+          map { ( defined $_ and length $_ ) ? $_ : 0 } ( split( ':', $dpf ) )[ 0 .. 3 ];
+
+        next RECORD if $rel->( $ds,  $gs )  > $ts;
+        next RECORD if $rel->( $dh,  $gh )  > $th;
+        next RECORD if $rel->( $dw,  $gw )  > $tw;
+        next RECORD if $rel->( $dcn, $gcn ) > $tcn;
+
+        my @dcfs = map { split( ':', $_ ) } @dcf;
+        unless ( scalar(@gcfs) == scalar(@dcfs) ) {
+            logfile("Error in database format, aborting...");
+            last RECORD;
+        }
+
+        # A colour mismatch must reject the whole record. The unlabelled
+        # "next" used here before only continued this inner loop, so the
+        # colour data never prevented a match.
+        foreach my $idx ( 0 .. $#gcfs ) {
+            next RECORD if abs( $dcfs[$idx] - $gcfs[$idx] ) > $tc;
+        }
+
+        $result = $score;
+        last RECORD;
+    }
+    close(DB);
+    return $result;
+# Appends "score::digest" to the image hash database ($digest_db).
+#
+#   $digest  digest string from calc_image_hash()
+#   $score   score to store with it
+#
+# The file is always opened in append mode, which also creates it when
+# missing. The earlier "-e, then open for writing" sequence could
+# truncate a database created by another process in between.
+# An undefined or empty digest is ignored.
+sub add_image_hash_db {
+    my $digest = shift;
+    my $score  = shift;
+
+    return unless ( defined $digest and length $digest );
+
+    unless ( -e $digest_db ) {
+        debuglog("Image Hash Database not found to add hash, creating it...");
+    }
+    unless ( open( DB, '>>', $digest_db ) ) {
+        logfile("Unable to open/create Image Hash database at \"$digest_db\", check permissions.");
+        return;
+    }
+    debuglog("Adding hash \"$digest\" to Image Hash database...");
+    flock( DB, LOCK_EX );
+    seek( DB, 0, 2 );    # re-seek to the end after the lock is held
+    print DB "${score}::${digest}\n";
+    flock( DB, LOCK_UN );
+    close(DB);
+# Builds the digest used by the image hash database.
+#
+#   $_[0]  image data (the caller passes the PNM conversion)
+#
+# The digest is "length:N1:N2:lines" where N1xN2 is the first
+# "<digits>x<digits>" pair printed by $identify and "lines" is the number
+# of output lines of "$ppmhist -noheader", followed by up to $hash_ccnt
+# groups "::f1:f2:f3:f4:f5" taken from the first five fields of those
+# lines.
+# Returns ( 0, $digest ) on success and ( non-zero, undef ) when a helper
+# fails or no dimensions can be found.
+sub calc_image_hash {
+    my $picdata = $_[0];
+    my ( $hash, $h, $w );
+    my $s = length($picdata);
+
+    my ( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( "$identify -", $picdata );
+    if ($rcode) {
+        debuglog("Unable to calculate image hash, skipping...");
+        return ( $rcode, $hash );
+    }
+    # NOTE(review): identify presumably prints WIDTHxHEIGHT; the variable
+    # names are kept so the digest layout stays as it was - confirm.
+    foreach my $line (@$stdout_ref) {
+        if ( $line =~ /([0-9]+)x([0-9]+)/ ) {
+            $h = $1;
+            $w = $2;
+            last;
+        }
+    }
+    unless ( defined $h and defined $w ) {
+        # Without dimensions the digest would contain empty fields.
+        debuglog("Unable to calculate image hash, skipping...");
+        return ( 1, $hash );
+    }
+
+    ( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( "$ppmhist -noheader", $picdata );
+    if ($rcode) {
+        debuglog("Unable to calculate image hash, skipping...");
+        return ( $rcode, $hash );
+    }
+
+    my $line_count = scalar(@$stdout_ref);
+    my @ca = ();
+    if ($hash_ccnt) {
+        foreach my $line (@$stdout_ref) {
+            my @field = split( ' ', $line );
+            next if @field < 5;
+            push( @ca, "::$field[0]:$field[1]:$field[2]:$field[3]:$field[4]" );
+            # Numeric comparison; "ge" compared the counters as strings.
+            last if scalar(@ca) >= $hash_ccnt;
+        }
+    }
+    $hash = "$s:$h:$w:$line_count" . join( '', @ca );
+    return ( 0, $hash );
+# Logs $_[0] with a "Debug mode: " prefix, but only when the configured
+# verbosity is greater than 1.
+sub debuglog {
+    logfile("Debug mode: $_[0]") if ( $verbose > 1 );
+# Reports the FUZZY_OCR_WRONG_CTYPE hit for an image whose detected
+# format ($format) does not agree with its declared content type
+# ($ctype). Does nothing when $wctypescore is false. The score is written
+# into all four scoresets before the hit is recorded.
+sub wrong_ctype {
+    my ( $format, $ctype ) = @_;
+    return unless $wctypescore;
+
+    my $debuginfo = "";
+    if ( $verbose > 0 ) {
+        $debuginfo = "Image has format \"$format\" but content-type is \"$ctype\"";
+        debuglog($debuginfo);
+    }
+
+    my $formatted = sprintf( "%0.3f", $wctypescore );
+    foreach my $idx ( 0, 1, 2, 3 ) {
+        $pms->{conf}->{scoreset}->[$idx]->{"FUZZY_OCR_WRONG_CTYPE"} = $formatted;
+    }
+
+    my $description =
+      $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE} . "\n$debuginfo";
+    $pms->_handle_hit( "FUZZY_OCR_WRONG_CTYPE", $wctypescore, "BODY: ", $description );
+# Reports the FUZZY_OCR_CORRUPT_IMG hit.
+#   $unfixable  true: score with $cimgscore2, false: score with $cimgscore
+#   $err        error line describing the corruption
+# Does nothing when the selected score is false.
+sub corrupt_img {
+    my ( $unfixable, $err ) = @_;
+
+    my $score = $unfixable ? $cimgscore2 : $cimgscore;
+    return unless $score;
+
+    my $debuginfo = "";
+    if ( $verbose > 0 ) {
+        chomp($err);
+        $debuginfo = "Corrupt image: $err";
+        debuglog($debuginfo);
+    }
+
+    my $formatted = sprintf( "%0.3f", $score );
+    foreach my $idx ( 0, 1, 2, 3 ) {
+        $pms->{conf}->{scoreset}->[$idx]->{"FUZZY_OCR_CORRUPT_IMG"} = $formatted;
+    }
+
+    my $description =
+      $pms->{conf}->{descriptions}->{FUZZY_OCR_CORRUPT_IMG} . "\n$debuginfo";
+    $pms->_handle_hit( "FUZZY_OCR_CORRUPT_IMG", $score, "BODY: ", $description );
+# Reports the FUZZY_OCR_KNOWN_HASH hit for an image whose digest was
+# found in the hash database, using the score stored with it.
+sub known_img_hash {
+    my ( $digest, $score ) = @_;
+
+    my $debuginfo = "";
+    if ( $verbose > 0 ) {
+        $debuginfo = "Hash \"$digest\" is in the database.";
+        debuglog($debuginfo);
+    }
+
+    my $formatted = sprintf( "%0.3f", $score );
+    foreach my $idx ( 0, 1, 2, 3 ) {
+        $pms->{conf}->{scoreset}->[$idx]->{"FUZZY_OCR_KNOWN_HASH"} = $formatted;
+    }
+
+    my $description =
+      $pms->{conf}->{descriptions}->{FUZZY_OCR_KNOWN_HASH} . "\n$debuginfo";
+    $pms->_handle_hit( "FUZZY_OCR_KNOWN_HASH", $score, "BODY: ", $description );
+# Main scan, run under the timeout set up by fuzzyocr_check().
+#
+# For every image part of the message: detect the real format from the
+# magic bytes, report content-type mismatches and corrupt GIFs, convert
+# the image to PNM, optionally consult the image hash database, run every
+# scanset on the PNM data and fuzzy-match the OCR output against @words.
+# When at least $countreq matches were found, FUZZY_OCR is reported with
+# $base_score plus $add_score for each additional match.
+# Always returns 0; hits are reported through $pms.
+#
+# Changes against the earlier revision:
+#   * frame counts and focr_pre314 are compared numerically ("10" lt "5"
+#     is true as a string comparison, and "0.0" eq 0 is false)
+#   * feature flags are tested numerically, so the documented value "0.0"
+#     really disables them
+#   * @words and @scansets are no longer rewritten through loop aliases,
+#     so per-word thresholds survive past the first image
+#   * a digest is only queued for the database when it was calculated
+#   * empty words and empty OCR lines are skipped
+sub check_fuzzy_ocr {
+    my @found      = ();
+    my $image_type = 0;
+    my $picture_data;
+    my @hashes  = ();
+    my $cnt     = 0;
+    my $homedir = ( getpwuid($<) )[7];
+    my $hashing = ( $enable_image_hashing  and $enable_image_hashing != 0 )  ? 1 : 0;
+    my $learn   = ( $hashing_learn_scanned and $hashing_learn_scanned != 0 ) ? 1 : 0;
+
+    debuglog("Starting FuzzyOcr...");
+    debuglog("Attempting to load personal wordlist...");
+    if ($homedir) {
+        load_personal_words( $homedir . "/$pwordlist" );
+    }
+    elsif ( defined( $ENV{HOME} ) ) {
+        load_personal_words( $ENV{HOME} . "/$pwordlist" );
+    }
+    else {
+        debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available...");
+    }
+
+    foreach my $p ( $pms->{msg}->find_parts(qr/^image\b/i) ) {
+        my $cscore = $pms->get_score();
+        if ( $cscore > $dscore ) {
+            debuglog(
+                "Scan canceled, message has already more than $dscore points.");
+            return 0;
+        }
+        my $ctype = $p->{'type'};
+        next unless ( $ctype =~ /image/i );
+
+        debuglog("Analyzing file with content-type \"$ctype\"");
+        $picture_data = $p->decode();
+        my @used_scansets = ();
+        my $stdout_ref;
+        my $stderr_ref;
+        my $rcode   = 0;
+        my $corrupt = 0;
+        my $digest;
+
+        if ( substr( $picture_data, 0, 3 ) eq "\x47\x49\x46" ) {
+            if ( $ctype !~ /gif/i ) {
+                wrong_ctype( "GIF", $ctype );
+            }
+            $image_type = 1;
+            my $interlaced_gif = 0;
+            my $image_count    = 0;
+            ( $rcode, $stdout_ref, $stderr_ref ) =
+              pipe_io( $giftext, $picture_data, 1, 1 );
+            foreach my $line (@$stdout_ref) {
+                $interlaced_gif = 1 if ( $line =~ /Image is Interlaced/i );
+                $image_count++ if ( $line =~ /^Image #/ );
+            }
+            my $animated = ( $image_count > 1 ) ? 1 : 0;
+
+            if ( $interlaced_gif or $animated ) {
+                debuglog("Image is interlaced or animated...");
+            }
+            else {
+                debuglog("Image is single non-interlaced...");
+                ( $rcode, $stdout_ref, $stderr_ref ) =
+                  pipe_io( "$giffix", $picture_data, 0, 1 );
+                $picture_data = join( '', @$stdout_ref );
+            }
+            foreach my $line (@$stderr_ref) {
+                if ( $line =~ /GIF-LIB error/i ) {
+                    $corrupt = $line;
+                    last;
+                }
+            }
+            if ( $corrupt and ( $interlaced_gif or $animated ) ) {
+                debuglog("Skipping corrupted interlaced image...");
+                corrupt_img( 1, $corrupt );
+                next;
+            }
+            elsif ($corrupt) {
+                unless ($picture_data) {
+                    debuglog("Uncorrectable corruption detected, skipping non-interlaced image...");
+                    corrupt_img( 1, $corrupt );
+                    next;
+                }
+                debuglog("Image is corrupt, but seems fixable, continuing...");
+                corrupt_img( 0, $corrupt );
+            }
+            if ($animated) {
+                debuglog("File contains more than one image...");
+                if ( $image_count < $max_images ) {
+                    debuglog("Assembling images...");
+                    ( $rcode, $stdout_ref, $stderr_ref ) =
+                      pipe_io( "$convert - +append -", $picture_data );
+                    if ($rcode) { next; }
+                    $picture_data = join( '', @$stdout_ref );
+                }
+                elsif ( $pre314 == 0 ) {
+                    debuglog("Image count exceeds limit, skipping some...");
+                    ( $rcode, $stdout_ref, $stderr_ref ) =
+                      pipe_io( "$gifasm -d \$tmpdir/out", $picture_data, $image_count );
+                    if ($rcode) { next; }
+                    $picture_data = join( '', @$stdout_ref );
+                }
+                else {
+                    debuglog("Image count exceeds limit, but your version does not allow the required functions, skipping image...");
+                    next;
+                }
+            }
+            if ($interlaced_gif) {
+                ( $rcode, $stdout_ref, $stderr_ref ) =
+                  pipe_io( "$gifinter -s | $giftopnm -", $picture_data );
+                if ($rcode) { next; }
+            }
+            else {
+                ( $rcode, $stdout_ref, $stderr_ref ) =
+                  pipe_io( "$giftopnm -", $picture_data );
+                if ($rcode) { next; }
+            }
+        }
+        elsif ( substr( $picture_data, 0, 2 ) eq "\xff\xd8" ) {
+            if ( $ctype !~ /jpe{0,1}g/i ) {
+                wrong_ctype( "JPEG", $ctype );
+            }
+            $image_type = 2;
+            ( $rcode, $stdout_ref, $stderr_ref ) =
+              pipe_io( "$jpegtopnm", $picture_data );
+            if ($rcode) { next; }
+        }
+        elsif ( substr( $picture_data, 0, 4 ) eq "\x89\x50\x4e\x47" ) {
+            if ( $ctype !~ /png/i ) {
+                wrong_ctype( "PNG", $ctype );
+            }
+            $image_type = 3;
+            ( $rcode, $stdout_ref, $stderr_ref ) =
+              pipe_io( "$pngtopnm -", $picture_data );
+            if ($rcode) { next; }
+        }
+        else {
+            $image_type = 0;
+            debuglog(
+"Image type not recognized, unknown format. Skipping this image..."
+            );
+            next;
+        }
+
+        debuglog("Recognized file type: $image_type");
+        my @pnmdata = @$stdout_ref;
+
+        if ($hashing) {
+            debuglog("Calculating the image hash...");
+            ( $rcode, $digest ) = calc_image_hash( join( '', @pnmdata ) );
+            if ( $rcode or not defined $digest ) {
+                debuglog("Error calculating the image hash, skipping hash check...");
+            }
+            else {
+                if ( my $score = check_image_hash_db($digest) ) {
+                    debuglog("Image found in hash database, message is spam...");
+                    debuglog("Scoring with known old score and ending...");
+                    known_img_hash( $digest, $score );
+                    return 0;
+                }
+                debuglog("Hash not yet known to the database, saving for later db storage...");
+                push( @hashes, $digest );
+            }
+        }
+        else {
+            debuglog("Image hashing disabled in configuration, skipping...");
+        }
+
+        # Run every scanset; keep the normalised OCR lines of each one.
+        my @ocr_results = ();
+        foreach my $scanset (@scansets) {
+            my $cmd = $scanset;    # copy: do not rewrite the configured scanset
+            $cmd =~ s/\$gocr/$gocr/g;
+            ( $rcode, $stdout_ref, $stderr_ref ) =
+              pipe_io( "$cmd", join( '', @pnmdata ), 1 );
+            if ($rcode) {
+                debuglog( join( '', @$stderr_ref ) );
+                debuglog(
+"Skipping scanset \"$cmd\" because of errors, trying next..."
+                );
+                next;
+            }
+            # Map look-alike characters to letters, keep letters only and
+            # lower-case; done once here instead of once per word.
+            my @ocrdata = grep { length } map {
+                my $line = $_;
+                $line =~ tr/!;|081/iiioal/;
+                $line =~ s/[^a-zA-Z]//g;
+                lc $line;
+            } @$stdout_ref;
+            push( @ocr_results,   [@ocrdata] );
+            push( @used_scansets, $cmd );
+        }
+
+        foreach my $entry (@words) {
+            # Work on copies: $entry aliases the global list, and the
+            # "word::threshold" suffix must survive for the next image.
+            my ( $word, $wthreshold ) = ( $entry, $threshold );
+            if ( $entry =~ /^(.*?)::(0(\.\d+){0,1})/ ) {
+                ( $word, $wthreshold ) = ( $1, $2 );
+            }
+            $word =~ s/[^a-zA-Z]//g;
+            $word = lc $word;
+            next unless length $word;
+
+            my $wcnt = 0;
+            my $gcnt = 0;
+            foreach my $ocr_set (@ocr_results) {
+                my $cwcnt = 0;
+                foreach my $line (@$ocr_set) {
+                    my $matched = adistr( $word, $line );
+                    if ( abs($matched) < $wthreshold ) {
+                        $cwcnt++;
+                        debuglog(
+"Found word \"$word\" in line\n \"$line\" \n with fuzz of "
+                              . abs($matched)
+                              . " scanned with scanset $used_scansets[$gcnt]"
+                        );
+                    }
+                }
+                $wcnt = max( $wcnt, $cwcnt );
+                $gcnt++;
+            }
+            $cnt += $wcnt;
+            if ( ( $verbose > 0 ) and ($wcnt) ) {
+                push( @found, "\"$word\" in $wcnt lines" );
+            }
+        }
+    }
+
+    if ( $cnt >= $countreq ) {
+        my $score = ( $base_score + ( $cnt - $countreq ) * $add_score );
+        if ( $hashing and $learn ) {
+            debuglog("Message is spam (score $score), storing all image hashes in database...");
+            foreach my $hash (@hashes) {
+                add_image_hash_db( $hash, $score );
+            }
+        }
+        else {
+            debuglog("Message is spam (score $score)...");
+        }
+        my $debuginfo = "";
+        if ( $verbose > 0 ) {
+            $debuginfo =
+              ( "Words found:\n"
+                  . join( "\n", @found )
+                  . "\n($cnt word occurrences found)" );
+            debuglog($debuginfo);
+        }
+        for my $set ( 0 .. 3 ) {
+            $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR"} =
+              sprintf( "%0.3f", $score );
+        }
+        $pms->_handle_hit( "FUZZY_OCR", $score, "BODY: ",
+            $pms->{conf}->{descriptions}->{FUZZY_OCR} . "\n$debuginfo" );
+    }
+    debuglog("FuzzyOcr ending successfully...");
+    return 0;
diff --git a/spamassassin/fuzzyocr/FuzzyOcr.words b/spamassassin/fuzzyocr/FuzzyOcr.words
new file mode 100644
index 0000000..77ebf33
--- /dev/null
+++ b/spamassassin/fuzzyocr/FuzzyOcr.words
@@ -0,0 +1,45 @@
+# Here we defined the words to scan for
+# Stock
+# Pills
+# Misc
+click here
diff --git a/spamassassin/fuzzyocr/INSTALL b/spamassassin/fuzzyocr/INSTALL
new file mode 100644
index 0000000..672e37d
--- /dev/null
+++ b/spamassassin/fuzzyocr/INSTALL
@@ -0,0 +1,119 @@
+Installation manual for FuzzyOcr 2.3:
+1. Dependencies you require for this plugin to work
+ Before starting, also make sure to read the OS/distribution specific notes at the end of this section.
+ 1.1 Spamassassin 3.x
+ This plugin requires Spamassassin 3.x. Using it on version 2.x is not supported and might fail.
+ At least one function in this plugin requires Spamassassin 3.1.4, if you do not have this version,
+ don't forget to set the "focr_pre314" option in the FuzzyOcr.cf file.
+ 1.2 NetPBM tools
+ Install the NetPBM tools (http://netpbm.sourceforge.net/). If you don't install the binaries in /usr/bin,
+ please make sure to adjust the FuzzyOcr.cf file to point to the correct binaries.
+ 1.3 ImageMagick
+ At least one feature requires the convert binary from ImageMagick (http://www.imagemagick.org/).
+ Again, make sure the configuration file points to the convert binary, if not placed in /usr/bin.
+ 1.4 Giflib (also known as libungif)
+ Several tools from this package are required, see (http://sourceforge.net/projects/giflib/).
+ Attention: the giftext binary from this package has a bug which can cause segfaults.
+ On the download page, a source patch is provided which fixes this.
+ 1.5 Gocr
+ For OCR recognition, gocr (http://jocr.sourceforge.net/) must be installed.
+ Attention: the gocr binary has a bug which can cause segfaults with specific images.
+ On the download page, a source patch is provided which fixes this.
+ 1.6 Perl modules:
+ These perl modules are required:
+ Digest::MD5
+ String::Approx
+ Notes for Fedora Core 5 (or higher) users: The package libungif-utils provides the necessary libungif binaries.
+ Notes for other Redhat/FC users: The packages libungif and libungif-progs should be installed.
+ Notes for Debian users: The package libungif-bin provides the necessary libungif binaries.
+ Notes for Slackware users: I have no clue about this distro, but Andy Lyttle sent me a mail about it:
+ "Slackware doesn't currently have a libungif-utils/progs/bin package, and the libungif package does not include the binaries such as giffix. So, you have to hack it a bit.
+ 1. Download (or copy from CD) the /source/l/libungif directory, don't untar anything
+ 2. Edit the libungif.SlackBuild and comment out this line:
+ # I don't believe we need all this slop. Correct me if I'm wrong.
+ rm -rf $PKG/usr/bin
+ 3. Run "sh libungif.SlackBuild"
+ 4. Uninstall the libungif package, if it's already installed
+ 5. Look in /tmp, and install the new libungif package there"
+ Notes for Gentoo users: All dependencies except the perl modules can be installed via portage. But because of the bugs in giftext and gocr,
+ you might need to write an ebuild which uses the two patches found on my download page. The perl modules can easily be
+ installed with gcpan.
+2. Installing the plugin:
+ 2.1. Installing the required files
+ Put the FuzzyOcr.cf and the FuzzyOcr.pm files into /etc/mail/spamassassin.
+ The FuzzyOcr.cf file already contains a line to load the plugin, if you want to put the .pm file in a different location,
+ change this line accordingly.
+ Create a wordlist file, a sample wordlist is shipped with this release, and put it also in /etc/mail/spamassassin.
+ 2.2 Necessary configuration
+ Open the FuzzyOcr.cf file. Make sure that you specify a writable file as a logfile, or a directory where the plugin can write to,
+ so it can create the logfile itself. Also make sure that you specify a correct file as global wordlist.
+ With these two adjustments, FuzzyOcr is ready to work.
+ 3. Further adjustments
+ 3.1. Enabling the image hash database
+ Set focr_enable_image_hashing to 1 in the config file, and make sure that focr_digest_db points to a writable file/directory.
+ You can also create this file yourself if you like. By default, all images recognized as spam, are added
+ to this database automatically. The score is saved as well and reused later again.
+ 3.2 Tweaking Scansets
+ Everyone gets different image spam, and most times, one method to scan is not successful with all types of spam you get.
+ That's where the focr_scansets setting can help you. This setting takes a comma separated list of scansets.
+ Each scanset starts with the name of a program, followed by either other programs connected with pipes, or nothing anymore.
+ The only important thing is that input for this "program chain" is a picture in the PNM format, and the output is ASCII text.
+ An example might clarify this:
+ focr_scansets gocr -i -
+ This will do a single scan with gocr default settings.
+ focr_scansets pnminvert | gocr -i -
+ This will use pnminvert on the image and then do the scan.
+ focr_scansets gocr -i -, gocr -l 180 -i -
+ This will do 2 scans, one with the default settings, and the second one with a modified -l value.
+ You are now free to select which scansets get you the most spam, but don't pick too many, as this will also use more resources.
+ Here are some hints: -pnminvert or pnmquant are useful with white text or text with many colors
+ -If you get images which are littered with small dots/lines, try -d 2 as an argument to gocr
+ -The -l setting often helps, try values like 180, 140, or 100
+ Two syntax remarks: -Instead of writing "gocr", write "$gocr" as this will be replaced with the correct path to your gocr binary.
+ -If you invoke custom binaries (like pnminvert for example), you can redirect the stderr output by using:
+ "pnminvert 2>>$errfile"
+ If the scanset fails then, and debug logging is enabled, you will see this stderr output in the logfile :)
+ I know this seems confusing for some, but if this is unclear somehow, feel free to write an email to the list.
+And now, where it gets most thrilling...
+To be continued...
diff --git a/spamassassin/fuzzyocr/LICENSE b/spamassassin/fuzzyocr/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/spamassassin/fuzzyocr/LICENSE
@@ -0,0 +1,202 @@
+ Apache License
+ Version 2.0, January 2004
+ 1. Definitions.
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
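+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the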
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+ APPENDIX: How to apply the Apache License to your work.
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+ Copyright [yyyy] [name of copyright owner]
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/spamassassin/fuzzyocr/README.weasel b/spamassassin/fuzzyocr/README.weasel
new file mode 100644
index 0000000..57ca565
--- /dev/null
+++ b/spamassassin/fuzzyocr/README.weasel
@@ -0,0 +1,2 @@
+Downloaded September 2006 from
diff --git a/spamassassin/fuzzyocr/samples/README b/spamassassin/fuzzyocr/samples/README
new file mode 100644
index 0000000..98370c4
--- /dev/null
+++ b/spamassassin/fuzzyocr/samples/README
@@ -0,0 +1,61 @@
+These eml files are sample spam emails to test your installation of FuzzyOCR. Assuming you are using the default settings, the output you get should match the output listed here.
+Use spamassassin -t < samplefile.eml to test :)
+corrupted-gif.eml: Contains a corrupted gif image. Additionally, I changed the content-type to jpeg, so the output should show:
+ 1.5 FUZZY_OCR_WRONG_CTYPE BODY: Mail contains an image with wrong
+ content-type set
+ Image has format "GIF" but content-type is
+ "image/jpeg"
+ 5.0 FUZZY_OCR_CORRUPT_IMG BODY: Mail contains a corrupted image
+ Corrupt image: GIF-LIB error: Image is
+ defective, decoding aborted.
+ 10 FUZZY_OCR BODY: Mail contains an image with common spam text inside
+ Words found:
+ "stock" in 2 lines
+ "investor" in 1 lines
+ "company" in 1 lines
+ "price" in 2 lines
+ "trade" in 1 lines
+ "service" in 1 lines
+ (8 word occurrences found)
+animated-gif.eml: Contains an animated gif with four frames. Both with default settings and with "focr_gif_max_frames 3" this should output:
+ 10 FUZZY_OCR BODY: Mail contains an image with common spam text inside
+ Words found:
+ "stock" in 2 lines
+ "company" in 3 lines
+ "trade" in 1 lines
+ "penis" in 1 lines
+ "growth" in 1 lines
+ (8 word occurrences found)
+Note: Please verify that this is the output both with the setting mentioned and without, because with this setting, a different test is used.
+jpeg.eml: Contains a jpeg file. Output should show:
+ 6.0 FUZZY_OCR BODY: Mail contains an image with common spam text inside
+ Words found:
+ "viagra" in 2 lines
+ "cialis" in 1 lines
+ "levitra" in 1 lines
+ (4 word occurrences found)
+png.eml: Contains a png file. Output should show:
+ 24 FUZZY_OCR BODY: Mail contains an image with common spam text inside
+ Words found:
+ "stock" in 1 lines
+ "investor" in 3 lines
+ "company" in 2 lines
+ "money" in 1 lines
+ "buy" in 1 lines
+ "price" in 6 lines
+ "trade" in 2 lines
+ "service" in 2 lines
+ "software" in 2 lines
+ "levitra" in 1 lines
+ "legal" in 1 lines
+ (22 word occurrences found)
diff --git a/spamassassin/fuzzyocr/samples/animated-gif.eml b/spamassassin/fuzzyocr/samples/animated-gif.eml
new file mode 100644
index 0000000..f9637c1
--- /dev/null
+++ b/spamassassin/fuzzyocr/samples/animated-gif.eml
@@ -0,0 +1,579 @@
+Return-Path: <>
+Received: by (Postfix, from userid 500)
+ id 5A723120C1BB; Tue, 22 Aug 2006 06:05:16 +0200 (CEST)
+X-Spam-Checker-Version: SpamAssassin 3.1.3-gr0 (2006-06-01) on
+X-Spam-Level: *
+X-Spam-Status: No, score=1.7 required=5.0 tests=DATE_IN_PAST_06_12,
+ version=3.1.3-gr0
+ * 0.8 EXTRA_MPART_TYPE Header has extraneous Content-type:...type= entry
+ * 0.7 DATE_IN_PAST_06_12 Date: is 6 to 12 hours before Received: date
+ * 0.1 TW_WY BODY: Odd Letter Triples with WY
+ * 0.1 TW_YX BODY: Odd Letter Triples with YX
+ * 0.0 HTML_MESSAGE BODY: HTML included in message
+Received: from ( [])
+ by (Postfix) with ESMTP id A0059120C1B7
+ for <>; Tue, 22 Aug 2006 06:05:08 +0200 (CEST)
+Received: from ( [])
+ by (Postfix) with ESMTP id 4E9C915A8190
+ for <>; Tue, 22 Aug 2006 06:04:44 +0200 (CEST)
+Received: from [] (helo=[])
+ by with esmtp (WEB.DE 4.107 #114)
+ id 1GFNVG-0008NZ-00
+ for; Tue, 22 Aug 2006 06:04:40 +0200
+Received: from [] (helo=[])
+ by [] with smtp (Exim 4.43)
+ id VKBWE-SZHY-NRO; Mon, 21 Aug 2006 23:03:24 +0500
+Message-ID: <001301c6c59f$ea022980$f255f53f@seguridad>
+From: "Contains Closed" <>
+Subject: Largest Wireless
+Date: Mon, 21 Aug 2006 23:03:24 +0500
+MIME-Version: 1.0
+Content-Type: multipart/related;
+ type="multipart/alternative";
+ boundary="----=_NextPart_000_000F_01C6C576.012C2180"
+X-Priority: 3
+X-MSMail-Priority: Normal
+X-Mailer: Microsoft Outlook Express 6.00.2800.1106
+X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1106
+Content-Type: multipart/alternative;
+ boundary="----=_NextPart_001_0010_01C6C576.012C2180"
+Content-Type: text/plain;
+ charset="windows-1250"
+Content-Transfer-Encoding: quoted-printable
+CSS MySQL dangerous myself glad switched sites learning do. kind =
+products Excel Mac somewhat amazing among team addon fulfilled thinking =
+rejoicing job making us.neuro Says: Nothing around.I themes for. kwyxz =
+aware Wordpress discuss way.I you. Dave. Todd none. constant wishing =
+bro.I grown shudder Apart Movable Type.That said please Those wonderful =
+helping thousands Nora Gatine Yikes stunned package. luck Japanese =
+Paris. When sister
+fuel fire. Yesterday quotAll Things
+Foreign Relations destroys Americas roots: September marks Week seven
+Basically corporate bear free. run. gentleman house primarily
+writers happy result. unlike unusual STABLE site. trying find monstrous =
+headache place. reasons addresses tend disappear speak publicly urge =
+guestto claims going steal titles. easily course. pirated. advance soon =
+discover amount hassle order somewhere excepting Libraryis hardly =
+effort. shape prepared formats. software already.
+Isles started attracted also. But... but... Nonsense Between remember =
+selling muchwere higher. precise: compared interim Well obviously cant =
+seems explains absurd claim somehow novels guarantee delighted spurt =
+noting Heart Darkness pattern. appeared copies. dramatic mere increased =
+declining Before move emphasize HARD FIGURES. stress quothard arguing =
+debate NEVER figures. Harlan Ellison screams quotLost piracybut best =
+knowledge tried true. endlessly pirated quotmust is. hurt. willare =
+convince different analogy captures reality better. Anyone bought carnew =
+usedknows perfectly
+Painting Religion Life Starving Genius Hour Living Chemistry Pictures =
+Delays XaRk Phil Hello Seahorse Aeroplano zoe Cosmetics Flaming Lips =
+Nortec NUDE Cosmonova
+Sigh Fucked World Political Ranting Too Caffeine
+location satirical Hobbit turns
+Jo... paragraph sidebar. Counter A:hover
+pirated. advance soon discover amount hassle order somewhere excepting =
+Libraryis hardly effort. shape prepared formats. software already. Try =
+finding ANY looked sloppy editions miserable read. yet... despite seen =
+yourself instances strong
+grey.Do iPods downI players loud. heavy metal asleep. seriously stare =
+blankly wry smile.Do disembark firstI Asians carriage straight seat =
+find... bowl process. hogI hate seats. variety ways: middle bag =
+audience. Most pass cases dont really writer arent willing keep
+looking WebPage Graphical Hit
+Release Rocket Pocket
+legs clearance inside
+Colophon manifesto Outside
+Hey unasa nosotros gustar estoy viendo placebo tambin
+elected partly backlash politics
+gesturing Such check empty
+Boones Farm
+year. hide faults arguments twoparty coalition been papering
+skinny knee front. quietAnd shut mean. shove. Reduce domain. services. =
+Engine SNES Nostalgia Dreamcast
+Content-Type: text/html;
+ charset="windows-1250"
+Content-Transfer-Encoding: quoted-printable
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<META http-equiv=3DContent-Type content=3D"text/html; =3D
+<META content=3D"MSHTML 6.00.2800.1106" name=3DGENERATOR>
+<BODY bgColor=3D#ffffff>
+<DIV align=3Dcenter><FONT face=3DArial size=3D2>
+<IMG alt=3D""
+hspace=3D0 src=3D"cid:000e01c6c59f$ea022980$f255f53f@seguridad"
+<DIV align=3Dleft><FONT face=3DArial size=3D2>CSS MySQL dangerous myself =
+glad switched sites learning do. kind products Excel Mac somewhat =
+amazing among team addon fulfilled thinking rejoicing job making =
+us.neuro Says: Nothing around.I themes for. kwyxz aware Wordpress =
+discuss way.I you. Dave. Todd none. constant wishing bro.I grown shudder =
+Apart Movable Type.That said please Those wonderful helping thousands =
+Nora Gatine Yikes stunned package. luck Japanese Paris. When =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>fuel fire. Yesterday =
+quotAll Things</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>CHOKOLATE GHOST WORL =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Foreign Relations destroys =
+Americas roots: September marks Week seven</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Basically corporate bear =
+free. run. gentleman house primarily</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>writers happy result. =
+unlike unusual STABLE site. trying find monstrous headache place. =
+reasons addresses tend disappear speak publicly urge guestto claims =
+going steal titles. easily course. pirated. advance soon discover amount =
+hassle order somewhere excepting Libraryis hardly effort. shape prepared =
+formats. software already.</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Isles started attracted =
+also. But... but... Nonsense Between remember selling muchwere higher. =
+precise: compared interim Well obviously cant seems explains absurd =
+claim somehow novels guarantee delighted spurt noting Heart Darkness =
+pattern. appeared copies. dramatic mere increased declining Before move =
+emphasize HARD FIGURES. stress quothard arguing debate NEVER figures. =
+Harlan Ellison screams quotLost piracybut best knowledge tried true. =
+endlessly pirated quotmust is. hurt. willare convince different analogy =
+captures reality better. Anyone bought carnew usedknows =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Painting Religion Life =
+Starving Genius Hour Living Chemistry Pictures Places</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Delays XaRk Phil Hello =
+Seahorse Aeroplano zoe Cosmetics Flaming Lips Nortec NUDE =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>GUSTAVO CERATI AUSTIN AZUL =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>MACHINES MASSIVE ATTACK =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Sigh Fucked World =
+Political Ranting Too Caffeine</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>location satirical Hobbit =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>mostly</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Jo... paragraph sidebar. =
+Counter A:hover</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>pirated. advance soon =
+discover amount hassle order somewhere excepting Libraryis hardly =
+effort. shape prepared formats. software already. Try finding ANY looked =
+sloppy editions miserable read. yet... despite seen yourself instances =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>grey.Do iPods downI =
+players loud. heavy metal asleep. seriously stare blankly wry smile.Do =
+disembark firstI Asians carriage straight seat find... bowl process. =
+hogI hate seats. variety ways: middle bag better...</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>audience. Most pass cases =
+dont really writer arent willing keep</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>looking WebPage Graphical =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Release Rocket =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>legs clearance =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Colophon manifesto =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Hey unasa nosotros gustar =
+estoy viendo placebo tambin</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>FOBIA CHETES =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>elected partly backlash =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>gesturing Such check =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>Boones Farm</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>year. hide faults =
+arguments twoparty coalition been papering</FONT></DIV>
+<DIV align=3Dleft><FONT face=3DArial size=3D2>OTRO PLOMOS FATALIST =
+<DIV align=3Dleft><FONT face=3DArial size=3D2>skinny knee front. =
+quietAnd shut mean. shove. Reduce domain. services. rorted</FONT></DIV>
+Content-Type: image/gif;
+ name="PILOTS.gif"
+Content-Transfer-Encoding: base64
+Content-ID: <000e01c6c59f$ea022980$f255f53f@seguridad>
diff --git a/spamassassin/fuzzyocr/samples/corrupted-gif.eml b/spamassassin/fuzzyocr/samples/corrupted-gif.eml
new file mode 100644
index 0000000..9cd564b
--- /dev/null
+++ b/spamassassin/fuzzyocr/samples/corrupted-gif.eml
@@ -0,0 +1,227 @@
+Return-Path: <>
+Received: from mellox ( [])
+ by (Postfix) with ESMTP id 3BA7A120C1AC
+ for <>; Tue, 8 Aug 2006 10:04:01 +0200 (CEST)
+Resent-From: Sascha Just <>
+Resent-To: Christian Holler <>
+Resent-Date: Tue, 8 Aug 2006 02:04:00 -0600
+Resent-Message-ID: <>
+Received: by (Postfix, from userid 500)
+ id 5C9B812080C2; Tue, 8 Aug 2006 02:22:37 +0200 (CEST)
+X-Spam-Checker-Version: SpamAssassin 3.1.3-gr0 (2006-06-01) on
+X-Spam-Status: No, score=0.0 required=5.0 tests=HTML_MESSAGE
+ autolearn=disabled version=3.1.3-gr0
+ * 0.0 HTML_MESSAGE BODY: HTML included in message
+Received: from ( [])
+ by (Postfix) with ESMTP id 0A142120C1AC
+ for <>; Tue, 8 Aug 2006 02:21:20 +0200 (CEST)
+Received: from ( [])
+ by (Postfix) with ESMTP id ED4F9FC29FF
+ for <>; Tue, 8 Aug 2006 02:21:19 +0200 (CEST)
+Received: from [] (
+ by with smtp (WEB.DE 4.107 #114)
+ id 1GAFL2-0007C4-00; Tue, 08 Aug 2006 02:20:55 +0200
+X-MID: <>
+Date: Mon, 07 Aug 2006 19:20:44 -0600
+Message-Id: <>
+From: Clifton Ballard <>
+Subject: To take on supplier
+MIME-Version: 1.0
+Content-Type: multipart/related;
+ boundary="----=_NextPart_000_00BD_01C5163C.78C70EB0"
+X-UID: 3354
+X-Length: 15790
+This is a multi-part message in MIME format.
+Content-Type: multipart/alternative;
+ boundary="----=_NextPart_001_00A0_01C5042E.12A32EA0"
+Content-Type: text/plain;
+ charset=us-ascii
+Content-Transfer-Encoding: 7Bit
+with a noiseless step, when we were close to them, addressed brains against - against mantelpieces, said my aunt; an idea which
+run up and down stairs for me, all day long. You never sit and again in the old country. Do not frown, Micawber. I do not now
+Content-Type: text/html;
+ charset=us-ascii
+Content-Transfer-Encoding: 7Bit
+<META http-equiv=Content-Type content="text/html; charset=us-ascii">
+<DIV><FONT face="Verdana" size=1><IMG ALT="" border="0" SRC=""></FONT></DIV>
+<DIV><FONT face="Verdana" size=1>Of the pair of hired post-horses being ready, and of Doras going</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>Put my meaning into any words you like, said I. You know what know; far from that. - but if you will sometimes think - just to</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>never should have been got into my present state if I hadnt come in my memory, pervaded it again. When dinner was done, Mr.</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>No, no. please. cried Dora, with a kiss, dont be a naughty Blue morning, when she requested an immediate settlement of the same,</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>have not known my partner, Mr. jorkins, as long as I have. Nothing Anywhere. Im a going to seek my niece through the wureld. Im</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>thats lived along with her and had her for their all in all, these is not in Traddless way. He is perfectly good-humoured respecting</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>he cherished an artistic admiration of their style of composition, being mentioned, I recognized it, however, and said as much.</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>finishing touch to that renunciation of mankind in which she had gracious to Peggotty, except when I inadvertently called her by</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>of him never wandering in that better mind of his to which stimulant, having little room in his system for any other article</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>finding himself as comfortable as he expected, or being a little perfect country gentleman to follow lustily with the same cry.</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>say as much for her bonnet and resumed her seat composedly. solicitude; and my poor mother herself could not have loved me</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>with us. Who knows when we may meet again, else? Come. Say the avoid the subject. All I desire, Mr. Copperfield, is, that it</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>she was so much disturbed in mind as to find it necessary to open Still, resumed Miss Murdstone, I found no proof until last</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>Yes. she said. I beg and pray that no one will leave the room. I cannot do with ampial satisfaction to my own feelings. But, in</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>every morning, took away our bread; and also how he himself had rest - and touched, on the horizon, with a strip of silvery light</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>said Mr. Chillip; but she is quite a shadow now. Would it be certain. Dont mistrust me. Our wants are not many. If I rent</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>slumbering echoes in the caverns of Memory awakened; what a kind and all gentlemen with anything bashful in their appearance, and</FONT></DIV>
+<DIV><FONT face="Verdana" size=1>to be useful to the family; and that if I got on in the world, and I hope its enough, child, said my aunt. If there had been more</FONT></DIV>
+Content-Type: image/jpeg;
+ name="sbillet"
+Content-Transfer-Encoding: base64
+Content-ID: <>
diff --git a/spamassassin/fuzzyocr/samples/jpeg.eml b/spamassassin/fuzzyocr/samples/jpeg.eml
new file mode 100644
index 0000000..d6e8b47
--- /dev/null
+++ b/spamassassin/fuzzyocr/samples/jpeg.eml
@@ -0,0 +1,426 @@
+X-Mozilla-Status: 0001
+X-Mozilla-Status2: 00000000
+Received: from [] (helo=localhost)
+ by with smtp (WEB.DE 4.107 #114)
+ id 1G8oLr-0001kH-00
+ for; Fri, 04 Aug 2006 03:19:48 +0200
+Message-ID: <000001c6b760$e0a08f00$0100007f@localhost>
+From: "Collin Cox" <>
+To: <>
+Subject: Hello
+Date: Fri, 04 Aug 2006 08:23:04 -0900
+MIME-Version: 1.0
+Content-Type: multipart/related;
+ boundary="----=_NextPart_000_0001_01C6B760.E0A08F00"
+X-Priority: 3 (Normal)
+X-MSMail-Priority: Normal
+X-Mailer: Microsoft Outlook, Build 10.0.3416
+Importance: Normal
+X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1506
+This is a multi-part message in MIME format.
+Content-Type: multipart/alternative;
+ boundary="----=_NextPart_001_000E_01C6B760.E0A08F00"
+Content-Type: text/plain;
+ charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+Langdon looked again at the fax an ancient myth confirmed in black and white.
+The implications were frightening. He gazed absently through the bay window.
+The first hint of dawn was sifting through the birch trees in his backyard,
+but the view looked somehow different this morning. As an odd combination of fear and
+exhilaration settled over him, Langdon knew he had no choice
+The man led Langdon the length of the hangar. They rounded the corner onto the runway.
+Content-Type: text/html;
+ charset="us-ascii"
+Content-Transfer-Encoding: quoted-printable
+<META HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html; charset=3Dus-ascii">
+<meta name=3DGenerator content=3D"Microsoft Word 10 (filtered)">
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {margin:0cm;
+ margin-bottom:.0001pt;
+ font-size:12.0pt;
+ font-family:"Times New Roman";}
+a:link, span.MsoHyperlink
+ {color:blue;
+ text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+ {color:purple;
+ text-decoration:underline;}
+ {font-family:Arial;
+ color:windowtext;}
+@page Section1
+ {size:595.3pt 841.9pt;
+ margin:2.0cm 42.5pt 2.0cm 3.0cm;}
+ {page:Section1;}
+<body lang=3DRU link=3Dblue vlink=3Dpurple>
+<a href=3D""><img src=3D"cid:image001.jpg@01C671DF.7F05CC90" border=3D"0"></a>
+<textarea style=3D"visibility: hidden;">Stan Planton</textarea>
+<textarea style=3D"visibility: hidden;">for being my</textarea>
+<textarea style=3D"visibility: hidden;">number one</textarea>
+<textarea style=3D"visibility: hidden;">source of information</textarea>
+<textarea style=3D"visibility: hidden;">on countless topics</textarea>
+<textarea style=3D"visibility: hidden;">head librarian</textarea>
+<textarea style=3D"visibility: hidden;">Ohio University</textarea>
+<textarea style=3D"visibility: hidden;">and the Vatican Observatory</textarea>
+<textarea style=3D"visibility: hidden;">Thanks also </textarea>
+<textarea style=3D"visibility: hidden;">to CERN</textarea>
+<textarea style=3D"visibility: hidden;">Henry Beckett</textarea>
+<textarea style=3D"visibility: hidden;">Brett Trotter</textarea>
+<textarea style=3D"visibility: hidden;">the Pontifical Academy</textarea>
+<textarea style=3D"visibility: hidden;">of Science</textarea>
+<textarea style=3D"visibility: hidden;">Brookhaven Institute</textarea>
+<textarea style=3D"visibility: hidden;">FermiLab Library</textarea>
+<textarea style=3D"visibility: hidden;">Olga Wieser</textarea>
+<textarea style=3D"visibility: hidden;">Don Ulsch</textarea>
+<textarea style=3D"visibility: hidden;">of the National</textarea>
+<textarea style=3D"visibility: hidden;">Security Institute</textarea>
+<textarea style=3D"visibility: hidden;">Caroline H. Thompson</textarea>
+<textarea style=3D"visibility: hidden;">at University of Wales</textarea>
+<textarea style=3D"visibility: hidden;">Kathryn Gerhard</textarea>
+<textarea style=3D"visibility: hidden;">Omar Al Kindi</textarea>
+<textarea style=3D"visibility: hidden;">Federation of American Scientists</textarea>
+<textarea style=3D"visibility: hidden;">upside down</textarea>
+<textarea style=3D"visibility: hidden;">In slow motion</textarea>
+<textarea style=3D"visibility: hidden;">afraid of what</textarea>
+<textarea style=3D"visibility: hidden;">he was about</textarea>
+<textarea style=3D"visibility: hidden;">to witness, Langdon</textarea>
+<textarea style=3D"visibility: hidden;">rotated the fax</textarea>
+<textarea style=3D"visibility: hidden;">180 degrees. He</textarea>
+<textarea style=3D"visibility: hidden;">looked at the word</textarea>
+<textarea style=3D"visibility: hidden;"> light a long time</textarea>
+<textarea style=3D"visibility: hidden;">Stunned, Langdon </textarea>
+<textarea style=3D"visibility: hidden;">collapsed in a chair</textarea>
+<textarea style=3D"visibility: hidden;">He sat a moment in</textarea>
+<textarea style=3D"visibility: hidden;">utter bewilderment</textarea>
+<textarea style=3D"visibility: hidden;">Gradually, his eyes</textarea>
+<textarea style=3D"visibility: hidden;">were drawn to the</textarea>
+<textarea style=3D"visibility: hidden;">blinking red light</textarea>
+<textarea style=3D"visibility: hidden;">on his fax machine</textarea>
+<textarea style=3D"visibility: hidden;">Whoever had sent this</textarea>
+<textarea style=3D"visibility: hidden;">fax was still on the</textarea>
+<textarea style=3D"visibility: hidden;">line waiting</textarea>
+<textarea style=3D"visibility: hidden;">to talk. Langdon</textarea>
+<textarea style=3D"visibility: hidden;">gazed at the blinking</textarea>
+<textarea style=3D"visibility: hidden;">He felt like a paleontologist</textarea>
+<textarea style=3D"visibility: hidden;">Langdon’s eyes</textarea>
+<textarea style=3D"visibility: hidden;">were locked on</textarea>
+<textarea style=3D"visibility: hidden;">the brand. Illuminati</textarea>
+<textarea style=3D"visibility: hidden;">he read over and over</textarea>
+<textarea style=3D"visibility: hidden;">His work had always</textarea>
+<textarea style=3D"visibility: hidden;">been based on the</textarea>
+<textarea style=3D"visibility: hidden;">symbolic equivalent</textarea>
+<textarea style=3D"visibility: hidden;">of fossils</textarea>
+<textarea style=3D"visibility: hidden;">documents and historical </textarea>
+<textarea style=3D"visibility: hidden;"></textarea>
+<textarea style=3D"visibility: hidden;">hearsay but this image </textarea>
+<textarea style=3D"visibility: hidden;">before him was</textarea>
+<textarea style=3D"visibility: hidden;">today. Present tense</textarea>
+Content-Type: image/jpeg;
+ name="image001.jpg"
+Content-Transfer-Encoding: base64
+Content-ID: <image001.jpg@01C671DF.7F05CC90>
diff --git a/spamassassin/fuzzyocr/samples/png.eml b/spamassassin/fuzzyocr/samples/png.eml
new file mode 100644
index 0000000..3704c21
--- /dev/null
+++ b/spamassassin/fuzzyocr/samples/png.eml
@@ -0,0 +1,318 @@
+X-Mozilla-Status: 0001
+X-Mozilla-Status2: 00000000
+Received: from [] (
+ by with esmtp (WEB.DE 4.107 #114)
+ id 1G8VvK-0000PU-00
+ for; Thu, 03 Aug 2006 07:39:10 +0200
+Received: by (Postfix, from userid 2527)
+ id 772B56574038; Thu, 3 Aug 2006 07:39:10 +0200 (CEST)
+X-Spam-Flag: YES
+X-Spam-Checker-Version: SpamAssassin 3.1.3 (2006-06-01) on
+X-Spam-Level: ************
+X-Spam-Status: Yes, score=12.8 required=1.5 tests=BAYES_50,
+ HTML_MESSAGE,LONGWORDS,MSGID_FROM_MTA_ID autolearn=no version=3.1.3
+ * 1.1 EXTRA_MPART_TYPE Header has extraneous Content-type:...type= entry
+ * 1.4 MSGID_FROM_MTA_ID Message-Id for external message added locally
+ * 2.0 DATE_IN_FUTURE_03_06 Date: is 3 to 6 hours after Received: date
+ * 0.5 HTML_40_50 BODY: Message is 40% to 50% HTML
+ * 0.0 HTML_MESSAGE BODY: HTML included in message
+ * 0.0 BAYES_50 BODY: Bayesian spam probability is 40 to 60%
+ * [score: 0.4859]
+ * 3.8 LONGWORDS Long string of long words
+ * 4.1 FORGED_MUA_OUTLOOK Forged mail pretending to be from MS Outlook
+Received: from thuan-bf45dbaa6 (unknown [])
+ by (Postfix) with ESMTP id 4E4E36574037
+ for <>; Thu, 3 Aug 2006 07:39:03 +0200 (CEST)
+From: "Beatrice Cole" <>
+To: <>
+Subject: Your monte-jus
+Date: Thu, 3 Aug 2006 05:39:04 -0420
+MIME-Version: 1.0
+Content-Type: multipart/related;
+ type="multipart/alternative";
+ boundary="----=_NextPart_000_006A_01C6B6F9.CDE95250"
+X-Priority: 3
+X-MSMail-Priority: Normal
+X-Mailer: Microsoft Outlook Express 6.00.2720.3000
+X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2720.3000
+Message-Id: <>
+This is a multi-part message in MIME format.
+Content-Type: multipart/alternative;
+ boundary="----=_NextPart_001_006B_01C6B6F9.CDE95250"
+Content-Type: text/plain;
+ charset="iso-8859-2"
+Content-Transfer-Encoding: quoted-printable
+ his webbed feet, lifted=20
+his beak, and strained to hold a painful hardwant him. Or is it all=20
+right?"know my opinion of the incomparable Godi Muller?" Pmber, =20
+ We recently reI wouldn't be going into the Zone that day.=20
+But what would be the nicest wayunt, and suspect that yoone. Watch where=20
+it goes and don't take your eyes off it again."accessesee it. The man=20
+thinks he knows and understands the Zone completely. Thatbars on the=20
+windows just like a police station. Willy was sitting at=20
+hislieutenants. I don't like those trucks! They've been exposed to the=20
+elementsd by an unauthoripassion-proudnet tonpaper-fillednote shaverN=20
+electronrd party. Protthings any lighter. In fact, the bastards made it=20
+seem even darker. And now, "Here," he said. "From a grateful=20
+humanity.ecting the seo "OK," I said, "we had our =20
+fun, now let's go. Watch closely. I'mwould be in the way as far as=20
+he was concerned. We would run down, just theunt and o "That's what=20
+we in Harmont call the thieves who risk their lives in thencern. =20
+ Therefore, as prevention measure, because=20
+he looked away immediately.were parked on the paved lot next to it. He=20
+was right about the trucks--hisAnd as soon as he was quiet, I heard it.=20
+Trrr, trrr, trrr... =B7 Kirill lookedcount features.We encourage would=20
+shiver with ecstasy. For now, as I saw it, he had a long way to go. He =20
+ A word of appreciation must be extended to Ms. Antonina W. Bouis,=20
+the Paynickel steelpaper knifeOld guardnigh-naked=20
+ has asspaper-usingmorro castlemilk testeroak treenique tracmotuca=20
+flymonk sakipassenger locomotiveoak thistleking number. =20
+ brickhard sea.: by knowing that you have already=20
+arrived ..." getting a start on building his own=20
+bitter hell out on the Far Cliffs. And uniquwanted to sec Guta right=20
+then and there. just like that. To look at her andwanted to shout=20
+"Stop! Freeze!" but I couldn't. And I probably wouldn't havee Ufour at a=20
+time, not even letting me finish my smoke. In short, I told hims: =20
+ =20
+ For more i "Well, what happens from here?=20
+Where are we going? Is there no suchreturn to the Flock.concentration,=20
+held his breath, forced one... single... more... inch...nformation =20
+ like an electric shock. "Wait, Kirill. Come out here."of the=20
+entire Shame in the sight of your fellow gulls!"attaining new peaks =20
+in Germany and the Netherlands. New writers arenot Heinlein or =20
+Bradbury or Clarke, but Stanislaw Lem, a Pole; that theem. Thank=20
+yoYou won't forget it.their practicing and their striving to understand=20
+ more of the perfectwell. They can help you bring the newcomers=20
+along."u for your=20
+proNon-brahmanicalnarrow-headedmonkey-balloff-goNon-germanis matter. =20
+Content-Type: text/html;
+ charset="iso-8859-2"
+Content-Transfer-Encoding: quoted-printable
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<META http-equiv=3DContent-Type content=3D"text/html; charset=3Diso-8859-2">
+<META content=3D"MSHTML 6.00.2720.3000" name=3DGENERATOR>
+<body bgColor=3D#ffffff> <table cellspacing=3D0 cellpadding=3D0=20
+width=3D600 align=3Dcenter border=3D0 id=3Dtable21>
+ <tr=20
+ <font size=3D2><IMG alt=3D"" hspace=3D0=20
+src=3D"cid:006901c6b6bf$218a7a50$6c822ecf@DQ72967B" align=3Dbaseline=20
+border=3D0></font></a><font size=3D2>
+ </font></td></div> </td> <td=20
+width=3D"85%"> <font face=3D"Trebuchet MS" size=3D2=20
+color=3D"#006699"><br> his webbed feet, lifted his beak, and=20
+strained to hold a painful hardwant him. Or is it all right?"know=20
+my opinion of the incomparable Godi Muller?" <b>Pmber</b>,<br> =20
+ <br> We recently reI wouldn't be going into the Zone that=20
+day. But what would be the nicest wayunt, and suspect that yoone. Watch=20
+where it goes and don't take your eyes off it again."accessesee it. The =20
+man thinks he knows and understands the Zone completely. Thatbars on=20
+the windows just like a police station. Willy was sitting at=20
+hislieutenants. I don't like those trucks! They've been exposed to the=20
+elementsd by an unauthoripassion-proudnet tonpaper-fillednote shaverN=20
+electronrd party. Protthings any lighter. In fact, the bastards made it=20
+seem even darker. And now, "Here," he said. "From a grateful=20
+humanity.ecting the seo "OK," I said, "we had our =20
+fun, now let's go. Watch closely. I'mwould be in the way as far as=20
+he was concerned. We would run down, just theunt and o "That's what=20
+we in Harmont call the thieves who risk their lives in=20
+thencern.</font></td> </tr> <tr> <td=20
+colspan=3D2><font face=3D"Trebuchet MS" size=3D2=20
+color=3D"#006699">Therefore, <b>as prevention measure,=20
+because he looked away immediately.were parked on the paved lot next to=20
+it. He was right about the trucks--hisAnd as soon as he was quiet, I=20
+heard it. Trrr, trrr, trrr... =B7 Kirill lookedcount features</b>.We=20
+encourage would shiver with ecstasy. For now, as I saw it, he had a long=20
+way to go. He A word of appreciation must be extended to Ms. =20
+Antonina W. Bouis, the<br> <br> </font><font=20
+face=3D"Arial, Helvetica, sans-serif" size=3D2 color=3D"#006699">P<font=20
+face=3D"Trebuchet MS">aynickel steelpaper knifeOld guardnigh-naked =20
+has asspaper-usingmorro castlemilk testeroak treenique tracmotuca=20
+flymonk sakipassenger locomotiveoak thistleking=20
+number.</font></font><font face=3D"Trebuchet MS" size=3D2=20
+color=3D"#006699"> <br> </font><font=20
+face=3D"Arial, Helvetica, sans-serif" size=3D2=20
+color=3D"#006699"><b>brickhard sea.:<br> </b><font=20
+face=3D"Arial, Helvetica, sans-serif" size=3D2 color=3D"#006699"><font=20
+color=3D"#006600"><b>by knowing that you have already arrived=20
+."</b></font></font></font><font face=3D"Trebuchet MS" size=3D2=20
+color=3D"#006699"><br> <br> getting a start on=20
+building his own bitter hell out on the Far Cliffs. And uniquwanted to=20
+sec Guta right then and there. just like that. To look at her andwanted=20
+to shout "Stop! Freeze!" but I couldn't. And I probably wouldn't havee=20
+Ufour at a time, not even letting me finish my smoke. In short, I=20
+told hims:</font> <br>
+ <br>
+ <br>
+ <br> <div align=3Dleft> =20
+ </font><br> </div> </form> =20
+<font face=3D"Trebuchet MS" size=3D2 color=3D"#006699">For more i =20
+"Well, what happens from here? Where are we going? Is there no =20
+suchreturn to the Flock.concentration, held his breath, forced one... =20
+single... more... inch...nformation like an electric shock.=20
+"Wait, Kirill. Come out here."of the entire Shame in the sight of your=20
+fellow gulls!"attaining new peaks in Germany and the Netherlands.=20
+New writers arenot Heinlein or Bradbury or Clarke, but Stanislaw=20
+Lem, a Pole; that theem. Thank yoYou won't forget it.their practicing=20
+and their striving to understand more of the perfectwell. They can=20
+help you bring the newcomers along."u for your=20
+proNon-brahmanicalnarrow-headedmonkey-balloff-goNon-germanis matter.<br>=20
+ <br>
+<font color=3D"#000000"></div>
+Content-Type: image/png;
+ name="4WQUDM.PNG"
+Content-Transfer-Encoding: base64
+Content-ID: <006901c6b6bf$218a7a50$6c822ecf@DQ72967B>