diff options
Diffstat (limited to 'spamassassin/fuzzyocr/FuzzyOcr.pm')
-rw-r--r-- | spamassassin/fuzzyocr/FuzzyOcr.pm | 864 |
1 files changed, 864 insertions, 0 deletions
# Reconstructed from the new-file diff of spamassassin/fuzzyocr/FuzzyOcr.pm.
#
# FuzzyOcr plugin, version 2.3b
# Changelog:
#    version 2.0
#        Replaced imagemagick with netpbm
#        Invoke giffix to fix broken gifs before conversion
#        Support png images
#        Analyze the file to detect the format without content-type
#        Added several configuration parameters
#    version 2.1
#        Added scoring for wrong content-type
#        Added scoring for broken gif images
#        Added configuration for helper applications
#        Added autodisable_score feature to disable the OCR engine if the message has already enough points
#    version 2.1b
#        Rule bugfix to avoid warnings
#    version 2.1c
#        Applied patch provided by Howard Kash to fix problems with spamassassin + Mailscanner + FuzzyOcr
#        Removed '-' from jpegtopnm arguments to provide backwards compatibility for older netpbm versions
#        Fixed typo (treshold -> threshold)
#    version 2.2
#        Small bugfix in content-type check for jpeg (jpg was not matching), thanks to Matthias Keller
#        Added more error handling
#        Removed debug files, added logfile instead
#        More messages with verbose = 2
#    version 2.3
#        Multiple scans with different pnm preprocessing and gocr arguments possible
#        Support for interlaced gifs
#        Support for animated gifs
#        Temporary file handling reorganized
#        External wordlist support
#        Personalized wordlist support
#        Spaces are now stripped from wordlist words and OCR results before matching
#        Experimental MD5 Database feature
#    version 2.3b
#        MD5 Database replaced by different feature database
#        Corrupted images are now handled better
#        Added a timeout function to avoid lockups
#        Added threshold overriding on word basis in wordlist
#        Various bugfixes
#
#
# written by Christian Holler decoder_at_own-hero_dot_net

# syslog support added by weasel, 2006-09-28
#
package FuzzyOcr;

use strict;
use warnings;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Plugin;
use Sys::Syslog;

use String::Approx 'adistr';

use FileHandle;
use Fcntl ':flock';

our @ISA = qw (Mail::SpamAssassin::Plugin);

# sprintf() templates for handle_error().
#   [0] pipe could not be opened      (5 values: cmd, cmd, exit code, errno text, tmpfile)
#   [1] pipe command failed           (4 values: cmd, exit code, errno text, tmpfile)
#   [2] produced data not readable    (5 values: file, cmd, exit code, errno text, tmpfile)
#   [3] gocr invocation failed        (1 value:  arguments)
#   [4] global wordlist not readable  (1 value:  path)
our @err_msges = (
    "Failed to open pipe to external programs with pipe command \"%s\".\n"
      . "Please check that all helper programs are installed and in the correct path.\n"
      . "(Pipe Command \"%s\", Pipe exit code %d (\"%s\"), Temporary file: \"%s\")",
    "Unexpected error in pipe to external programs.\n"
      . "Please check that all helper programs are installed and in the correct path.\n"
      . "(Pipe Command \"%s\", Pipe exit code %d (\"%s\"), Temporary file: \"%s\")",
    "Cannot open \"%s\" to read previously produced data!\n"
      . "(Previously used pipe: \"%s\", error code %d (\"%s\"), Temporary file: \"%s\")",
    "Unexpected error while trying executing gocr with arguments \"%s\".\n"
      . "Make sure the gocr location is specified correctly and the arguments are correct.",
    "Failed to open global wordlist \"%s\" for reading.\n"
      . "Please check that path and permissions are correct."
);

# Words to look for. Filled from the global wordlist at configuration time;
# check_fuzzy_ocr() adds the personal wordlist to a per-message copy.
our @words = ();

# Plugin object and per-message status of the message currently being
# scanned; both are set by fuzzyocr_check().
our $self;
our $pms;

# Default values
our $threshold             = "0.3";
our $base_score            = "4";
our $add_score             = "1";
our $wctypescore           = "1.5";
our $cimgscore             = "2.5";
our $cimgscore2            = "5";
our $countreq              = 2;
our $verbose               = 1;
our $timeout               = 10;
our $pre314                = 0;
our $enable_image_hashing  = 0;
our $hashing_learn_scanned = 1;

# Tolerances used by check_image_hash_db() ($ts, $th, $tw, $tcn are relative,
# $tc is absolute) and the number of histogram rows stored per hash.
our ( $ts, $th, $tw, $tcn, $tc, $hash_ccnt ) = ( 0.01, 0.01, 0.01, 0.01, 5, 5 );

our $giffix     = "/usr/bin/giffix";
our $giftext    = "/usr/bin/giftext";
our $gifasm     = "/usr/bin/gifasm";
our $gifinter   = "/usr/bin/gifinter";
our $giftopnm   = "/usr/bin/giftopnm";
our $jpegtopnm  = "/usr/bin/jpegtopnm";
our $pngtopnm   = "/usr/bin/pngtopnm";
our $pnmfile    = "/usr/bin/pnmfile";
our $ppmhist    = "/usr/bin/ppmhist";
our $convert    = "/usr/bin/convert";
our $identify   = "/usr/bin/identify";
our $gocr       = "/usr/bin/gocr";
our $grep       = "/bin/grep";
our $max_images = 5;
our $dscore     = 10;

# $logfile is still configurable for compatibility, but logfile() writes to
# syslog and no longer uses it.
our $logfile   = "/etc/mail/spamassassin/FuzzyOcr.log";
our $pwordlist = ".spamassassin/fuzzyocr.words";
our $digest_db = "/etc/mail/spamassassin/FuzzyOcr.hashdb";

# OCR command lines; the literal text "$gocr" is replaced by the configured
# gocr path when a scanset is run.
our @scansets = (
    '$gocr -i -',
    '$gocr -l 180 -d 2 -i -'
);

# Configuration keys that simply store their value in a package variable.
my %SETTING_FOR = (
    focr_personal_wordlist       => \$pwordlist,
    focr_threshold               => \$threshold,
    focr_base_score              => \$base_score,
    focr_add_score               => \$add_score,
    focr_corrupt_score           => \$cimgscore,
    focr_corrupt_unfixable_score => \$cimgscore2,
    focr_wrongctype_score        => \$wctypescore,
    focr_counts_required         => \$countreq,
    focr_verbose                 => \$verbose,
    focr_timeout                 => \$timeout,
    focr_pre314                  => \$pre314,
    focr_bin_giffix              => \$giffix,
    focr_bin_giftext             => \$giftext,
    focr_bin_gifasm              => \$gifasm,
    focr_bin_gifinter            => \$gifinter,
    focr_bin_giftopnm            => \$giftopnm,
    focr_bin_jpegtopnm           => \$jpegtopnm,
    focr_bin_pngtopnm            => \$pngtopnm,
    focr_bin_ppmhist             => \$ppmhist,
    focr_bin_convert             => \$convert,
    focr_bin_identify            => \$identify,
    focr_bin_gocr                => \$gocr,
    focr_bin_grep                => \$grep,
    focr_gif_max_frames          => \$max_images,
    focr_autodisable_score       => \$dscore,
    focr_enable_image_hashing    => \$enable_image_hashing,
    focr_digest_db               => \$digest_db,
    focr_hashing_learn_scanned   => \$hashing_learn_scanned,
    focr_logfile                 => \$logfile,
);

# constructor: register the eval rule
sub new {
    my ( $class, $mailsa ) = @_;
    $class = ref($class) || $class;
    my $self = $class->SUPER::new($mailsa);
    bless( $self, $class );
    $self->register_eval_rule("fuzzyocr_check");
    $self->register_eval_rule("dummy_check");
    return $self;
}

# Handles the focr_* configuration keys.
# $opts is a hash ref; only its {key} and {value} entries are read here.
# Returns 1 (and stops further plugin callbacks) when the key belongs to this
# plugin, 0 otherwise.
# NOTE(review): the return/inhibit convention follows the documented
# Mail::SpamAssassin::Plugin parse_config contract - confirm for the
# SpamAssassin version in use.
sub parse_config {
    my ( $self, $opts ) = @_;
    my $key = $opts->{key};
    return 0 unless defined $key;

    if ( $key eq "focr_global_wordlist" ) {
        load_global_words( $opts->{value} );
    }
    elsif ( $key eq "focr_scansets" ) {
        parse_scansets( $opts->{value} );
    }
    elsif ( my $target = $SETTING_FOR{$key} ) {
        ${$target} = $opts->{value};
    }
    else {
        return 0;
    }

    $self->inhibit_further_callbacks();
    return 1;
}

# Eval rule that never hits.
sub dummy_check {
    return 0;
}

# Eval rule entry point: runs check_fuzzy_ocr() under a timeout of $timeout
# seconds. Always returns 0; hits are registered directly on the message
# status by the helpers below.
sub fuzzyocr_check {
    ( $self, $pms ) = @_;
    my $t = Mail::SpamAssassin::Timeout->new( { secs => $timeout } );
    $t->run( \&check_fuzzy_ocr );
    if ( $t->timed_out() ) {
        logfile("FuzzyOcr received timeout after running \"$timeout\" seconds.");
    }
    return 0;
}

# Reads wordlist entries from an open handle and returns them as a list.
# Comment lines are skipped, trailing "# ..." comments are removed, and
# entries without any letter are dropped (they would become an empty search
# word once non-letters are stripped for matching).
sub _read_wordlist {
    my ($fh) = @_;
    my @entries;
    while ( my $line = <$fh> ) {
        chomp $line;
        next if $line =~ /^[ \t]*#.*$/;
        $line =~ s/[ \t]*#.*$//;
        next unless $line =~ /[a-zA-Z]/;
        push @entries, $line;
    }
    return @entries;
}

# Appends the words of the global wordlist at the given path to @words.
# Returns 1 on success; logs an error and returns nothing when the file
# cannot be read.
sub load_global_words {
    my ($path) = @_;
    my $fh;
    unless ( defined $path and -r $path and open( $fh, '<', $path ) ) {
        handle_error( $err_msges[4], ( defined $path ? $path : '' ) );
        return;
    }
    push @words, _read_wordlist($fh);
    close $fh;
    return 1;
}

# Appends the words of a personal wordlist to @words. A missing or unreadable
# file is only reported in debug mode.
sub load_personal_words {
    my ($path) = @_;
    unless ( defined $path and -e $path ) {
        debuglog("No personal wordlist found, skipping...");
        return;
    }
    my $fh;
    unless ( -r $path and open( $fh, '<', $path ) ) {
        debuglog(
"Unable to read from wordlist \"$path\", please make sure that permissions are correct."
        );
        return;
    }
    push @words, _read_wordlist($fh);
    close $fh;
    return;
}

# Replaces @scansets with the comma separated commands in the given string.
# Spaces following a comma are dropped. Works on a copy so the caller's
# value is left untouched.
sub parse_scansets {
    my ($spec) = @_;
    $spec = '' unless defined $spec;
    $spec =~ s/,[ ]*/,/g;
    @scansets = split( ',', $spec );
    debuglog( "Set scansets to values:\n" . join( "\n", @scansets ) );
    return 1;
}

# Returns the larger of two numbers. An undefined argument is ignored; 0 is
# returned when both are undefined.
sub max {
    my ( $left, $right ) = @_;
    return 0      unless defined $left or defined $right;
    return $right unless defined $left;
    return $left  unless defined $right;
    return $left < $right ? $right : $left;
}

# Joins all arguments and splits the result into lines again.
sub reorder {
    my $tmp = join( '', @_ );
    return split( /\n/, $tmp );
}

# Path of frame number $nr as written below $dir by the "-d $tmpdir/out"
# command used in check_fuzzy_ocr() (out00.gif, out01.gif, ...).
sub _frame_path {
    my ( $dir, $nr ) = @_;
    return sprintf( '%s/out%02d.gif', $dir, $nr );
}

# Reads a whole file and returns a reference to its lines, or nothing when
# the file cannot be opened.
sub _slurp_lines {
    my ($path) = @_;
    open( my $fh, '<', $path ) or return;
    binmode $fh;
    my @lines = <$fh>;
    close $fh;
    return \@lines;
}

# Feeds $input to a shell command and collects what it produced.
#
#   pipe_io( $pipecmd, $input, $silent, $ignerror )
#       stdout/stderr of the command are captured through temporary files.
#       $silent suppresses error logging, $ignerror makes a non-zero exit
#       status of the command non-fatal.
#   pipe_io( $pipecmd, $input, $filecount )
#       used when $pipecmd contains the literal text "$tmpdir": the command
#       writes up to $filecount frame files into a fresh temporary directory
#       and the largest frame is returned as stdout.
#
# Returns ( $rcode, \@stdout_lines, \@stderr_lines ); $rcode is 0 on success
# and non-zero on any failure.
sub pipe_io {
    my ( $pipecmd, $input, @extra ) = @_;

    # Ignore SIGPIPE only while talking to the helper; the previous handler
    # is restored on return.
    local $SIG{PIPE} = 'IGNORE';

    my $filecount = 0;
    my $silent    = 0;
    my $ignerror  = 0;
    my $tmpdir;
    my @stdout = ();
    my @stderr = ();

    # The first return value is used as a file path everywhere below, so the
    # second one is treated as the already opened handle and closed here.
    # NOTE(review): confirm the return order of secure_tmpfile().
    my ( $tmpfile, $tmpfh ) = Mail::SpamAssassin::Util::secure_tmpfile();
    my ( $errfile, $errfh ) = Mail::SpamAssassin::Util::secure_tmpfile();
    close($tmpfh) if ref $tmpfh;
    close($errfh) if ref $errfh;

    unless ( defined $tmpfile and defined $errfile ) {
        debuglog("Unable to create temporary files, aborting pipe_io()");
        unlink( grep { defined } ( $tmpfile, $errfile ) );
        return ( 1, \@stdout, \@stderr );
    }
    if ( $tmpfile eq $errfile ) {
        debuglog("Got same tmpfile twice! Aborting pipe_io() to avoid deadlocking");
        unlink($tmpfile);
        return ( 1, \@stdout, \@stderr );
    }

    # Removes every temporary file (and the frame directory, if any).
    my $cleanup = sub {
        if ( defined $tmpdir ) {
            for my $nr ( 0 .. ( $filecount || 0 ) ) {
                unlink( _frame_path( $tmpdir, $nr ) );
            }
            rmdir($tmpdir);
        }
        unlink( $tmpfile, $errfile );
    };

    if ( $pipecmd =~ /\$tmpdir/ ) {
        $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir();
        unless ( defined $tmpdir ) {
            debuglog("Unable to create temporary directory, aborting pipe_io()");
            $cleanup->();
            return ( 1, \@stdout, \@stderr );
        }
        $pipecmd =~ s/\$tmpdir/$tmpdir/g;
        $filecount = shift @extra;
    }
    else {
        ( $silent, $ignerror ) = @extra;
    }

    $pipecmd =~ s/\$errfile/$errfile/g;

    # The command needs the shell (redirections, pipelines between helpers),
    # so it is passed as a single string.
    my $pipe_in;
    my $pipe_pid = open( $pipe_in, '|-', "$pipecmd 1>$tmpfile 2>>$errfile" );

    unless ($pipe_pid) {
        my $errno  = $!;
        my $status = $?;
        unless ($silent) {
            handle_error( $err_msges[0],
                ( $pipecmd, $pipecmd, $status >> 8, $errno, $tmpfile ) );
        }
        $cleanup->();
        return ( $status || 1, \@stdout, \@stderr );
    }

    binmode $pipe_in;
    print {$pipe_in} ( defined $input ? $input : '' );
    close($pipe_in);
    my $status = $?;

    if ( $status and not $ignerror ) {
        my $errno = $!;

        # Keep what the helper wrote to stderr so callers can log it.
        my $err_lines = _slurp_lines($errfile);
        @stderr = @{$err_lines} if $err_lines;
        unless ($silent) {
            handle_error( $err_msges[1],
                ( $pipecmd, $status >> 8, $errno, $tmpfile ) );
        }
        $cleanup->();
        return ( $status, \@stdout, \@stderr );
    }

    if ($filecount) {

        # Pick the largest frame file.
        my ( $best_size, $best_nr ) = ( 0, 0 );
        for my $nr ( 0 .. $filecount - 1 ) {
            my $frame_file = _frame_path( $tmpdir, $nr );
            my $size       = -s $frame_file;
            next unless defined $size and $size > $best_size;
            ( $best_size, $best_nr ) = ( $size, $nr );
        }
        my $frame = _slurp_lines( _frame_path( $tmpdir, $best_nr ) );
        $cleanup->();
        return ( 1, \@stdout, \@stderr ) unless $frame;
        return ( 0, $frame, \@stderr );
    }

    my $out_lines = _slurp_lines($tmpfile);
    my $err_lines = _slurp_lines($errfile);
    unless ( $out_lines and $err_lines ) {
        my $errno   = $!;
        my $missing = $out_lines ? $errfile : $tmpfile;
        unless ($silent) {
            handle_error( $err_msges[2],
                ( $missing, $pipecmd, $status >> 8, $errno, $tmpfile ) );
        }
        $cleanup->();
        return ( $status || 1, \@stdout, \@stderr );
    }
    $cleanup->();
    return ( 0, $out_lines, $err_lines );
}

# Formats an error template from @err_msges with the given values and logs it.
sub handle_error {
    my ( $err_msg, @var_vals ) = @_;
    $err_msg = sprintf( $err_msg, @var_vals );
    logfile($err_msg);
    return;
}

# Writes a message to syslog (ident 'SA-FuzzyOCR', facility 'mail', priority
# 'info'), one syslog entry per line of text.
sub logfile {
    my ($logtext) = @_;
    return unless defined $logtext;
    openlog( 'SA-FuzzyOCR', 'pid', 'mail' );
    for my $line ( split /\n/, $logtext ) {

        # The text is passed as an argument, never as the format string, so
        # '%' characters in it are logged literally.
        syslog( 'info', '%s', $line );
    }
    closelog();
    return;
}

# True when $value differs from $reference by more than $tolerance, relative
# to $reference. Missing values and a zero reference with a non-zero value
# count as a difference.
sub _differs_relatively {
    my ( $value, $reference, $tolerance ) = @_;
    return 1 unless defined $value and defined $reference;
    if ( $reference == 0 ) {
        return $value == 0 ? 0 : 1;
    }
    return ( abs( $value - $reference ) / $reference ) > $tolerance ? 1 : 0;
}

# Looks up an image hash (as produced by calc_image_hash) in the hash
# database. A record matches when its first four fields are within the
# relative tolerances $ts/$th/$tw/$tcn and every colour field is within the
# absolute tolerance $tc. Returns the score stored with the first matching
# record, or 0 when nothing matches or the database is unusable.
sub check_image_hash_db {
    my ($digest) = @_;
    return 0 unless defined $digest;

    my ( $gpf, @gcf ) = split( /::/, $digest );
    return 0 unless defined $gpf;
    my ( $gs, $gh, $gw, $gcn ) = split( /:/, $gpf );
    my @gcfs = map { split /:/ } @gcf;

    my $db;
    unless ( open( $db, '<', $digest_db ) ) {
        debuglog("No Image Hash database found at \"$digest_db\", or permissions wrong.");
        return 0;
    }

    my $known_score = 0;
  RECORD:
    while ( my $record = <$db> ) {
        chomp $record;
        my ( $score, $dpf, @dcf ) = split( /::/, $record );
        next RECORD unless defined $dpf;
        my ( $ds, $dh, $dw, $dcn ) = split( /:/, $dpf );
        next RECORD if _differs_relatively( $ds,  $gs,  $ts );
        next RECORD if _differs_relatively( $dh,  $gh,  $th );
        next RECORD if _differs_relatively( $dw,  $gw,  $tw );
        next RECORD if _differs_relatively( $dcn, $gcn, $tcn );

        my @dcfs = map { split /:/ } @dcf;
        unless ( @gcfs == @dcfs ) {
            logfile("Error in database format, aborting...");
            last RECORD;
        }

        # Any colour field outside the tolerance rejects the whole record.
        for my $idx ( 0 .. $#gcfs ) {
            next RECORD if abs( $dcfs[$idx] - $gcfs[$idx] ) > $tc;
        }
        $known_score = $score;
        last RECORD;
    }
    close $db;
    return $known_score;
}

# Appends "<score>::<digest>" to the hash database, creating the file when
# it does not exist yet.
sub add_image_hash_db {
    my ( $digest, $score ) = @_;
    return unless defined $digest;

    unless ( -e $digest_db ) {
        debuglog("Image Hash Database not found to add hash, creating it...");
    }

    # Append mode creates the file when needed, so no separate "create"
    # branch is required.
    my $db;
    unless ( open( $db, '>>', $digest_db ) ) {
        logfile("Unable to open/create Image Hash database at \"$digest_db\", check permissions.");
        return;
    }
    debuglog("Adding hash \"$digest\" to Image Hash database...");
    flock( $db, LOCK_EX );
    seek( $db, 0, 2 );
    print {$db} "${score}::${digest}\n";
    flock( $db, LOCK_UN );
    close($db);
    return;
}

# Builds the feature hash of an image given as PNM data:
#   "<bytes>:<dim1>:<dim2>:<histogram rows>" followed by up to $hash_ccnt
#   "::r:g:b:l:c" entries taken from the ppmhist output.
# Returns ( 0, $hash ) on success and ( $non_zero_rcode, undef ) on failure.
sub calc_image_hash {
    my ($picdata) = @_;
    my ( $hash, $h, $w );
    my @ca = ();
    my $s  = length($picdata);

    my ( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( "$identify -", $picdata );
    if ($rcode) {
        debuglog("Unable to calculate image hash, skipping...");
        return ( $rcode, $hash );
    }

    # NOTE(review): identify presumably prints WIDTHxHEIGHT, in which case
    # $h/$w are swapped in name only; the order is kept so existing database
    # entries stay comparable.
    for my $line ( @{$stdout_ref} ) {
        if ( $line =~ /([0-9]+)x([0-9]+)/ ) {
            ( $h, $w ) = ( $1, $2 );
            last;
        }
    }
    unless ( defined $h and defined $w ) {
        debuglog("Unable to calculate image hash, skipping...");
        return ( 1, $hash );
    }

    ( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( "$ppmhist -noheader", $picdata );
    if ($rcode) {
        debuglog("Unable to calculate image hash, skipping...");
        return ( $rcode, $hash );
    }

    my $rows = scalar( @{$stdout_ref} );
    if ($hash_ccnt) {
        my $taken = 0;
        for my $line ( @{$stdout_ref} ) {
            my ( $r, $g, $b, $l, $c ) = split( ' ', $line );
            next unless defined $c;
            push( @ca, "::$r:$g:$b:$l:$c" );
            $taken++;
            last if $taken >= $hash_ccnt;
        }
    }
    $hash = "$s:$h:$w:$rows" . join( '', @ca );
    return ( 0, $hash );
}

# Logs a message only when verbosity is above 1.
sub debuglog {
    if ( $verbose > 1 ) {
        logfile("Debug mode: $_[0]");
    }
    return;
}

# Sets the score of $rule in all four score sets of the current message's
# configuration and registers the hit, appending $debuginfo to the rule
# description.
sub _register_hit {
    my ( $rule, $score, $debuginfo ) = @_;
    for my $set ( 0 .. 3 ) {
        $pms->{conf}->{scoreset}->[$set]->{$rule} = sprintf( "%0.3f", $score );
    }
    my $description = $pms->{conf}->{descriptions}->{$rule};
    $description = '' unless defined $description;
    $pms->_handle_hit( $rule, $score, "BODY: ", $description . "\n$debuginfo" );
    return;
}

# Scores FUZZY_OCR_WRONG_CTYPE when the detected image format does not match
# the declared content-type (no-op when $wctypescore is 0).
sub wrong_ctype {
    my ( $format, $ctype ) = @_;
    return unless $wctypescore;

    my $debuginfo = "";
    if ( $verbose > 0 ) {
        $debuginfo = "Image has format \"$format\" but content-type is \"$ctype\"";
        debuglog($debuginfo);
    }
    _register_hit( "FUZZY_OCR_WRONG_CTYPE", $wctypescore, $debuginfo );
    return;
}

# Scores FUZZY_OCR_CORRUPT_IMG with $cimgscore2 for unfixable images and
# $cimgscore otherwise (no-op when the selected score is 0).
sub corrupt_img {
    my ( $unfixable, $err ) = @_;
    my $score = $unfixable ? $cimgscore2 : $cimgscore;
    return unless $score;

    my $debuginfo = "";
    if ( $verbose > 0 ) {
        chomp($err);
        $debuginfo = "Corrupt image: $err";
        debuglog($debuginfo);
    }
    _register_hit( "FUZZY_OCR_CORRUPT_IMG", $score, $debuginfo );
    return;
}

# Scores FUZZY_OCR_KNOWN_HASH with the score stored in the hash database.
sub known_img_hash {
    my ( $digest, $score ) = @_;
    my $debuginfo = "";
    if ( $verbose > 0 ) {
        $debuginfo = "Hash \"$digest\" is in the database.";
        debuglog($debuginfo);
    }
    _register_hit( "FUZZY_OCR_KNOWN_HASH", $score, $debuginfo );
    return;
}

# Converts GIF data to PNM. Interlaced and multi-frame files are handled,
# single non-interlaced files are first run through giffix. Corruption is
# scored through corrupt_img(). Returns a reference to the PNM output lines,
# or nothing when the image has to be skipped.
sub _gif_to_pnm {
    my ($picture_data) = @_;
    my $interlaced_gif = 0;
    my $image_count    = 0;
    my $corrupt        = 0;

    my ( $rcode, $stdout_ref, $stderr_ref ) =
      pipe_io( $giftext, $picture_data, 1, 1 );

    for my $line ( @{$stdout_ref} ) {
        $interlaced_gif = 1 if $line =~ /Image is Interlaced/i;
        $image_count++      if $line =~ /^Image #/;
    }

    my $complex = ( $interlaced_gif or $image_count > 1 );
    if ($complex) {
        debuglog("Image is interlaced or animated...");
    }
    else {
        debuglog("Image is single non-interlaced...");
        ( $rcode, $stdout_ref, $stderr_ref ) =
          pipe_io( "$giffix", $picture_data, 0, 1 );
        $picture_data = join( '', @{$stdout_ref} );
    }

    for my $line ( @{$stderr_ref} ) {
        if ( $line =~ /GIF-LIB error/i ) {
            $corrupt = $line;
            last;
        }
    }

    if ( $corrupt and $complex ) {
        debuglog("Skipping corrupted interlaced image...");
        corrupt_img( 1, $corrupt );
        return;
    }
    elsif ($corrupt) {
        unless ( length $picture_data ) {
            debuglog("Uncorrectable corruption detected, skipping non-interlaced image...");
            corrupt_img( 1, $corrupt );
            return;
        }
        debuglog("Image is corrupt, but seems fixable, continuing...");
        corrupt_img( 0, $corrupt );
    }

    if ( $image_count > 1 ) {
        debuglog("File contains more than one image...");

        # Frame counts are compared numerically ("10" lt "5" is true as a
        # string comparison).
        if ( $image_count < $max_images ) {
            debuglog("Assembling images...");
            ( $rcode, $stdout_ref, $stderr_ref ) =
              pipe_io( "$convert - +append -", $picture_data );
            return if $rcode;
            $picture_data = join( '', @{$stdout_ref} );
        }
        elsif ( $pre314 eq '0' ) {
            debuglog("Image count exceeds limit, skipping some...");
            ( $rcode, $stdout_ref, $stderr_ref ) =
              pipe_io( "$gifasm -d \$tmpdir/out", $picture_data, $image_count );
            return if $rcode;
            $picture_data = join( '', @{$stdout_ref} );
        }
        else {
            debuglog("Image count exceeds limit, but your version does not allow the required functions, skipping image...");
            return;
        }
    }

    my $to_pnm = $interlaced_gif ? "$gifinter -s | $giftopnm -" : "$giftopnm -";
    ( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( $to_pnm, $picture_data );
    return if $rcode;
    return $stdout_ref;
}

# Detects the image format from the leading bytes, scores a mismatching
# content-type and converts the image to PNM.
# Returns ( $image_type, \@pnm_lines ) with $image_type 1 = GIF, 2 = JPEG,
# 3 = PNG, or an empty list when the image has to be skipped.
sub _convert_to_pnm {
    my ( $picture_data, $ctype ) = @_;

    if ( substr( $picture_data, 0, 3 ) eq "\x47\x49\x46" ) {
        wrong_ctype( "GIF", $ctype ) if $ctype !~ /gif/i;
        my $pnm = _gif_to_pnm($picture_data);
        return unless $pnm;
        return ( 1, $pnm );
    }

    my ( $image_type, $format, $ctype_re, $command );
    if ( substr( $picture_data, 0, 2 ) eq "\xff\xd8" ) {
        ( $image_type, $format, $ctype_re, $command ) =
          ( 2, "JPEG", qr/jpe{0,1}g/i, "$jpegtopnm" );
    }
    elsif ( substr( $picture_data, 0, 4 ) eq "\x89\x50\x4e\x47" ) {
        ( $image_type, $format, $ctype_re, $command ) =
          ( 3, "PNG", qr/png/i, "$pngtopnm -" );
    }
    else {
        debuglog(
"Image type not recognized, unknown format. Skipping this image..."
        );
        return;
    }

    wrong_ctype( $format, $ctype ) if $ctype !~ $ctype_re;
    my ( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( $command, $picture_data );
    return if $rcode;
    return ( $image_type, $stdout_ref );
}

# Maps characters that OCR commonly confuses to letters, removes everything
# that is not a letter and lower-cases the result.
sub _normalize_ocr_line {
    my ($line) = @_;
    $line =~ tr/!;|081/iiioal/;
    $line =~ s/[^a-zA-Z]//g;
    return lc $line;
}

# Scans every image part of the current message ($pms): converts it to PNM,
# optionally checks the image hash database, runs all scansets and fuzzily
# matches the OCR output against the wordlists. Registers the FUZZY_OCR hit
# when at least $countreq word occurrences were found. Always returns 0.
sub check_fuzzy_ocr {
    my @found  = ();
    my @hashes = ();
    my $cnt    = 0;
    my $homedir = ( getpwuid($<) )[7];

    # Work on a per-message copy so that personal words neither pile up in
    # the global list nor carry over to the next message.
    local @words = @words;

    debuglog("Starting FuzzyOcr...");
    debuglog("Attempting to load personal wordlist...");

    if ($homedir) {
        load_personal_words( $homedir . "/$pwordlist" );
    }
    elsif ( defined( $ENV{HOME} ) ) {
        load_personal_words( $ENV{HOME} . "/$pwordlist" );
    }
    else {
        debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available...");
    }

  PART:
    foreach my $p ( $pms->{msg}->find_parts(qr/^image\b/i) ) {
        my $cscore = $pms->get_score();
        if ( $cscore > $dscore ) {
            debuglog(
                "Scan canceled, message has already more than $dscore points.");
            return 0;
        }

        my $ctype = $p->{'type'};
        next PART unless defined $ctype and $ctype =~ /image/i;

        debuglog("Analyzing file with content-type \"$ctype\"");
        my $picture_data = $p->decode();
        next PART unless defined $picture_data and length $picture_data;

        my ( $image_type, $pnm_ref ) = _convert_to_pnm( $picture_data, $ctype );
        next PART unless $image_type;

        debuglog("Recognized file type: $image_type");
        my $pnm_data = join( '', @{$pnm_ref} );

        if ($enable_image_hashing) {
            debuglog("Calculating the image hash...");
            my ( $hash_rc, $digest ) = calc_image_hash($pnm_data);
            if ($hash_rc) {
                debuglog("Error calculating the image hash, skipping hash check...");
            }
            elsif ( my $score = check_image_hash_db($digest) ) {
                debuglog("Image found in hash database, message is spam...");
                debuglog("Scoring with known old score and ending...");
                known_img_hash( $digest, $score );
                return 0;
            }
            else {
                # Only successfully calculated hashes are kept for learning.
                debuglog("Hash not yet known to the database, saving for later db storage...");
                push( @hashes, $digest );
            }
        }
        else {
            debuglog("Image hashing disabled in configuration, skipping...");
        }

        my @ocr_results   = ();
        my @used_scansets = ();

        foreach my $scanset (@scansets) {

            # Substitute on a copy; the configured scanset keeps its
            # "$gocr" placeholder.
            ( my $command = $scanset ) =~ s/\$gocr/$gocr/;
            my ( $rcode, $stdout_ref, $stderr_ref ) =
              pipe_io( $command, $pnm_data, 1 );

            if ($rcode) {
                debuglog( join( '', @{$stderr_ref} ) );
                debuglog(
"Skipping scanset \"$command\" because of errors, trying next..."
                );
                next;
            }

            # Normalise each OCR line once instead of once per word.
            push( @ocr_results, [ map { _normalize_ocr_line($_) } @{$stdout_ref} ] );
            push( @used_scansets, $command );
        }

        foreach my $entry (@words) {

            # Work on copies: the wordlist entry (including an optional
            # "::threshold" suffix) must stay intact for the next image.
            my ( $w, $wthreshold ) = ( $entry, $threshold );
            if ( $entry =~ /^(.*?)::(0(\.\d+){0,1})/ ) {
                ( $w, $wthreshold ) = ( $1, $2 );
            }
            $w =~ s/[^a-zA-Z]//g;
            $w = lc $w;
            next if $w eq '';

            my $wcnt = 0;
            for my $gcnt ( 0 .. $#ocr_results ) {
                my $cwcnt = 0;
                foreach my $line ( @{ $ocr_results[$gcnt] } ) {
                    my $matched = adistr( $w, $line );
                    next unless defined $matched;
                    if ( abs($matched) < $wthreshold ) {
                        $cwcnt++;
                        debuglog(
"Found word \"$w\" in line\n \"$line\" \n with fuzz of "
                              . abs($matched)
                              . " scanned with scanset $used_scansets[$gcnt]"
                        );
                    }
                }

                # Count a word by its best scanset only.
                $wcnt = max( $wcnt, $cwcnt );
            }
            $cnt += $wcnt;
            if ( ( $verbose > 0 ) and ($wcnt) ) {
                push( @found, "\"$w\" in $wcnt lines" );
            }
        }
    }

    if ( $cnt >= $countreq ) {
        my $score = ( $base_score + ( $cnt - $countreq ) * $add_score );
        if ( $enable_image_hashing and $hashing_learn_scanned ) {
            debuglog("Message is spam (score $score), storing all image hashes in database...");
            foreach my $hash (@hashes) {
                add_image_hash_db( $hash, $score );
            }
        }
        else {
            debuglog("Message is spam (score $score)...");
        }
        my $debuginfo = "";
        if ( $verbose > 0 ) {
            $debuginfo =
              (     "Words found:\n"
                  . join( "\n", @found )
                  . "\n($cnt word occurrences found)" );
            debuglog($debuginfo);
        }
        _register_hit( "FUZZY_OCR", $score, $debuginfo );
    }
    debuglog("FuzzyOcr ending successfully...");
    return 0;
}

1;