summaryrefslogtreecommitdiff
path: root/spamassassin/fuzzyocr/FuzzyOcr.pm
diff options
context:
space:
mode:
Diffstat (limited to 'spamassassin/fuzzyocr/FuzzyOcr.pm')
-rw-r--r--spamassassin/fuzzyocr/FuzzyOcr.pm864
1 files changed, 864 insertions, 0 deletions
diff --git a/spamassassin/fuzzyocr/FuzzyOcr.pm b/spamassassin/fuzzyocr/FuzzyOcr.pm
new file mode 100644
index 0000000..5f5f01b
--- /dev/null
+++ b/spamassassin/fuzzyocr/FuzzyOcr.pm
@@ -0,0 +1,864 @@
+# FuzzyOcr plugin, version 2.3b
+# Changelog:
+# version 2.0
+# Replaced imagemagick with netpbm
+# Invoke giffix to fix broken gifs before conversion
+# Support png images
+# Analyze the file to detect the format without content-type
+# Added several configuration parameters
+# version 2.1
+# Added scoring for wrong content-type
+# Added scoring for broken gif images
+# Added configuration for helper applications
+# Added autodisable_score feature to disable the OCR engine if the message has already enough points
+# version 2.1b
+# Rule bugfix to avoid warnings
+# version 2.1c
+# Applied patch provided by Howard Kash to fix problems with spamassassin + Mailscanner + FuzzyOcr
+# Removed '-' from jpegtopnm arguments to provide backwards compatibility for older netpbm versions
+# Fixed typo (treshold -> threshold)
+# version 2.2
+# Small bugfix in content-type check for jpeg (jpg was not matching), thanks to Matthias Keller
+# Added more error handling
+# Removed debug files, added logfile instead
+# More messages with verbose = 2
+# version 2.3
+# Multiple scans with different pnm preprocessing and gocr arguments possible
+# Support for interlaced gifs
+# Support for animated gifs
+# Temporary file handling reorganized
+# External wordlist support
+# Personalized wordlist support
+# Spaces are now stripped from wordlist words and OCR results before matching
+# Experimental MD5 Database feature
+# version 2.3b
+# MD5 Database replaced by different feature database
+# Corrupted images are now handled better
+# Added a timeout function to avoid lockups
+# Added threshold overriding on word basis in wordlist
+# Various bugfixes
+#
+#
+# written by Christian Holler decoder_at_own-hero_dot_net
+
+# syslog support added by weasel, 2006-09-28
+#
+package FuzzyOcr;
+
+use strict;
+use warnings;
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Util;
+use Mail::SpamAssassin::Plugin;
+use Sys::Syslog;
+
+use String::Approx 'adistr';
+
+use FileHandle;
+use Fcntl ':flock';
+
+# FuzzyOcr derives from the SpamAssassin plugin base class.
+our @ISA = ('Mail::SpamAssassin::Plugin');
+
+# sprintf() templates handed to handle_error(), addressed by index:
+#   0 - the pipe to the helper programs could not be opened
+#   1 - the helper pipeline failed unexpectedly
+#   2 - previously produced output could not be read back
+#   3 - gocr failed
+#   4 - the global wordlist could not be opened
+our @err_msges = (
+    "Failed to open pipe to external programs with pipe command \"%s\".\n"
+      . "Please check that all helper programs are installed and in the correct path.\n"
+      . "(Pipe Command \"%s\", Pipe exit code %d (\"%s\"), Temporary file: \"%s\")",
+
+    "Unexpected error in pipe to external programs.\n"
+      . "Please check that all helper programs are installed and in the correct path.\n"
+      . "(Pipe Command \"%s\", Pipe exit code %d (\"%s\"), Temporary file: \"%s\")",
+
+    "Cannot open \"%s\" to read previously produced data!\n"
+      . "(Previously used pipe: \"%s\", error code %d (\"%s\"), Temporary file: \"%s\")",
+
+    "Unexpected error while trying executing gocr with arguments \"%s\".\n"
+      . "Make sure the gocr location is specified correctly and the arguments are correct.",
+
+    "Failed to open global wordlist \"%s\" for reading.\n"
+      . "Please check that path and permissions are correct."
+);
+
+# Shared state: the word list, plus the plugin object and per-message status
+# that fuzzyocr_check() stores for the helper subs.
+our @words = ();
+our $self;
+our $pms;
+
+# Default values (each can be overridden through parse_config unless noted)
+
+# Matching and scoring
+our $threshold   = '0.3';    # focr_threshold
+our $base_score  = '4';      # focr_base_score
+our $add_score   = '1';      # focr_add_score
+our $wctypescore = '1.5';    # focr_wrongctype_score
+our $cimgscore   = '2.5';    # focr_corrupt_score
+our $cimgscore2  = '5';      # focr_corrupt_unfixable_score
+our $countreq    = 2;        # focr_counts_required
+
+# Behaviour switches
+our $verbose               = 1;     # focr_verbose
+our $timeout               = 10;    # focr_timeout
+our $pre314                = 0;     # focr_pre314
+our $enable_image_hashing  = 0;     # focr_enable_image_hashing
+our $hashing_learn_scanned = 1;     # focr_hashing_learn_scanned
+
+# Tolerances used when comparing image hashes, and the number of colour
+# lines stored per hash (no configuration key in parse_config).
+our ( $ts, $th, $tw, $tcn, $tc, $hash_ccnt ) = ( 0.01, 0.01, 0.01, 0.01, 5, 5 );
+
+# Helper programs
+our $giffix    = '/usr/bin/giffix';
+our $giftext   = '/usr/bin/giftext';
+our $gifasm    = '/usr/bin/gifasm';
+our $gifinter  = '/usr/bin/gifinter';
+our $giftopnm  = '/usr/bin/giftopnm';
+our $jpegtopnm = '/usr/bin/jpegtopnm';
+our $pngtopnm  = '/usr/bin/pngtopnm';
+our $pnmfile   = '/usr/bin/pnmfile';
+our $ppmhist   = '/usr/bin/ppmhist';
+our $convert   = '/usr/bin/convert';
+our $identify  = '/usr/bin/identify';
+our $gocr      = '/usr/bin/gocr';
+our $grep      = '/bin/grep';
+
+# Limits and file locations
+our $max_images = 5;     # focr_gif_max_frames
+our $dscore     = 10;    # focr_autodisable_score
+our $logfile    = '/etc/mail/spamassassin/FuzzyOcr.log';
+our $pwordlist  = '.spamassassin/fuzzyocr.words';
+our $digest_db  = '/etc/mail/spamassassin/FuzzyOcr.hashdb';
+
+# OCR command lines; the literal text "$gocr" is replaced by the configured
+# gocr binary when a scanset is run.
+our @scansets = (
+    '$gocr -i -',
+    '$gocr -l 180 -d 2 -i -',
+);
+
+# Constructor. Builds the plugin object through the parent class, re-blesses
+# it into the requested class and registers the two eval rules implemented
+# in this file ("fuzzyocr_check" and "dummy_check").
+sub new {
+    my $class  = shift;
+    my $mailsa = shift;
+
+    # Accept both a class name and an existing object as invocant.
+    $class = ref($class) || $class;
+
+    my $plugin = $class->SUPER::new($mailsa);
+    bless $plugin, $class;
+
+    foreach my $rule (qw(fuzzyocr_check dummy_check)) {
+        $plugin->register_eval_rule($rule);
+    }
+
+    return $plugin;
+}
+
+# Configuration hook: handles one "focr_*" configuration line.
+#
+# Arguments:
+#   $self - the plugin object
+#   $opts - hash reference; only {key} and {value} are read here
+#
+# Returns 1 when the key belongs to this plugin, 0 otherwise. The result is
+# explicit because the previous implementation returned whatever value had
+# been assigned, so a setting such as "focr_verbose 0" looked unhandled to
+# the caller.
+sub parse_config {
+    my ( $self, $opts ) = @_;
+    my $key = $opts->{key};
+    return 0 unless defined $key;
+
+    # Keys whose value needs parsing by a helper sub.
+    my %handler_for = (
+        focr_global_wordlist => \&load_global_words,
+        focr_scansets        => \&parse_scansets,
+    );
+
+    # Keys whose value is stored verbatim in a package variable.
+    my %setting_for = (
+        focr_personal_wordlist       => \$pwordlist,
+        focr_threshold               => \$threshold,
+        focr_base_score              => \$base_score,
+        focr_add_score               => \$add_score,
+        focr_corrupt_score           => \$cimgscore,
+        focr_corrupt_unfixable_score => \$cimgscore2,
+        focr_wrongctype_score        => \$wctypescore,
+        focr_counts_required         => \$countreq,
+        focr_verbose                 => \$verbose,
+        focr_timeout                 => \$timeout,
+        focr_pre314                  => \$pre314,
+        focr_bin_giffix              => \$giffix,
+        focr_bin_giftext             => \$giftext,
+        focr_bin_gifasm              => \$gifasm,
+        focr_bin_gifinter            => \$gifinter,
+        focr_bin_giftopnm            => \$giftopnm,
+        focr_bin_jpegtopnm           => \$jpegtopnm,
+        focr_bin_pngtopnm            => \$pngtopnm,
+        focr_bin_ppmhist             => \$ppmhist,
+        focr_bin_convert             => \$convert,
+        focr_bin_identify            => \$identify,
+        focr_bin_gocr                => \$gocr,
+        focr_bin_grep                => \$grep,
+        focr_gif_max_frames          => \$max_images,
+        focr_autodisable_score       => \$dscore,
+        focr_enable_image_hashing    => \$enable_image_hashing,
+        focr_digest_db               => \$digest_db,
+        focr_hashing_learn_scanned   => \$hashing_learn_scanned,
+        focr_logfile                 => \$logfile,
+    );
+
+    if ( my $handler = $handler_for{$key} ) {
+        $handler->( $opts->{value} );
+    }
+    elsif ( my $target = $setting_for{$key} ) {
+        ${$target} = $opts->{value};
+    }
+    else {
+        # Not one of ours: let other plugins look at the line.
+        return 0;
+    }
+
+    # The line was consumed here, so no other plugin needs to see it.
+    $self->inhibit_further_callbacks();
+    return 1;
+}
+
+# Eval rule that never hits: it unconditionally returns 0.
+sub dummy_check {
+    my $no_hit = 0;
+    return $no_hit;
+}
+
+# Eval rule entry point. Stores the plugin object and the per-message status
+# in the package variables $self and $pms, then runs check_fuzzy_ocr() under
+# a timer of $timeout seconds. Always returns 0.
+sub fuzzyocr_check {
+    $self = $_[0];
+    $pms  = $_[1];
+
+    my $timer = Mail::SpamAssassin::Timeout->new( { secs => $timeout } );
+    $timer->run( \&check_fuzzy_ocr );
+
+    # Only the log records that the scan was cut short.
+    logfile("FuzzyOcr received timeout after running \"$timeout\" seconds.")
+      if $timer->timed_out();
+
+    return 0;
+}
+
+# Append the words of the global wordlist file to @words.
+#
+# Argument: path of the wordlist file.
+# Returns 1 on success, nothing (empty list / undef) when the file cannot be
+# read; the failure is reported through handle_error().
+#
+# File format as parsed here: one word per line; lines that start with "#"
+# (after optional blanks), lines consisting of a single non-letter character
+# and lines left empty after comment removal are skipped; a trailing
+# "# comment" is removed from the remaining lines.
+sub load_global_words {
+    my ($path) = @_;
+
+    # $err_msges[4] is the wordlist template (index 3 is the gocr one).
+    unless ( defined $path and -r $path ) {
+        handle_error( $err_msges[4], ( defined $path ? $path : '' ) );
+        return;
+    }
+
+    my $wordlist;
+    unless ( open( $wordlist, '<', $path ) ) {
+        handle_error( $err_msges[4], ($path) );
+        return;
+    }
+
+    # A lexical line variable keeps the caller's $_ untouched.
+    while ( my $line = <$wordlist> ) {
+        chomp($line);
+        next if $line =~ /^[ \t]*#.*$/;
+        next if $line =~ /^[^a-zA-Z]$/;
+        $line =~ s/[ \t]*#.*$//;
+
+        # An empty entry carries no word to match against.
+        next unless $line =~ /\S/;
+        push( @words, $line );
+    }
+    close($wordlist);
+    return 1;
+}
+
+# Append the words of a personal wordlist file to @words.
+#
+# Argument: path of the wordlist file.
+# Returns 1 when the file was read, nothing when it is missing or unreadable
+# (both cases are only mentioned in the debug log).
+#
+# Lines that start with "#" (after optional blanks) and lines left empty
+# after comment removal are skipped; a trailing "# comment" is removed.
+sub load_personal_words {
+    my ($path) = @_;
+
+    unless ( defined $path and -e $path ) {
+        debuglog("No personal wordlist found, skipping...");
+        return;
+    }
+
+    my $wordlist;
+    unless ( -r $path and open( $wordlist, '<', $path ) ) {
+        debuglog(
+"Unable to read from wordlist \"$path\", please make sure that permissions are correct."
+        );
+        return;
+    }
+
+    # A lexical line variable keeps the caller's $_ untouched.
+    while ( my $line = <$wordlist> ) {
+        chomp($line);
+        next if $line =~ /^[ \t]*#.*$/;
+        $line =~ s/[ \t]*#.*$//;
+
+        # An empty entry carries no word to match against.
+        next unless $line =~ /\S/;
+        push( @words, $line );
+    }
+    close($wordlist);
+    return 1;
+}
+
+# Replace @scansets with the entries of a comma separated list.
+#
+# Argument: the list as one string; blanks directly after a comma are
+# dropped. The argument is copied first so the caller's value is no longer
+# modified through the @_ alias.
+# Returns 1.
+sub parse_scansets {
+    my ($spec) = @_;
+    $spec = '' unless defined $spec;
+
+    @scansets = split( /,[ ]*/, $spec );
+    debuglog( "Set scansets to values:\n" . join( "\n", @scansets ) );
+    return 1;
+}
+
+# Return the larger of two numbers.
+#
+# An undefined argument is ignored in favour of the other one; when both are
+# undefined the result is 0. (The first test used "and", which returned 0 as
+# soon as either argument was undefined and made the two single-argument
+# cases unreachable.)
+sub max {
+    my ( $first, $second ) = @_;
+
+    return 0       unless defined($first) or defined($second);
+    return $second unless defined($first);
+    return $first  unless defined($second);
+
+    return $first < $second ? $second : $first;
+}
+
+# Concatenate all arguments into one string and cut it again at every
+# newline, returning the resulting pieces.
+sub reorder {
+    my $joined = '';
+    $joined .= $_ foreach @_;
+    return split( /\n/, $joined );
+}
+
+# Feed $input to a shell pipeline and collect what it produces.
+#
+# Arguments:
+#   $pipecmd - shell command line; the literal placeholders "$tmpdir" and
+#              "$errfile" in it are substituted before the command is run
+#   $input   - data written to the command's standard input
+#   further arguments depend on $pipecmd:
+#     with "$tmpdir":    $filecount - number of "outNN.gif" files to inspect
+#                        in the temporary directory; the largest one is
+#                        returned as output
+#     without "$tmpdir": $silent    - true: do not report errors
+#                        $ignerror  - true: ignore a non-zero exit status
+#
+# Returns ( $rcode, \@stdout, \@stderr ); $rcode is 0 on success and
+# non-zero on any failure. All temporary files, and the temporary directory
+# if one was created, are removed on every return path.
+sub pipe_io {
+    my $pipecmd = shift;
+    my $input   = shift;
+
+    my $filecount = 0;
+    my $silent    = 0;
+    my $ignerror  = 0;
+    my $tmpdir;
+    my @stdout = ();
+    my @stderr = ();
+
+    # Ignore SIGPIPE only while this sub runs instead of for the rest of the
+    # process lifetime.
+    local $SIG{PIPE} = 'IGNORE';
+
+    # secure_tmpfile() hands back the path first and the open handle second.
+    # Only the paths are needed (shell redirection, reading back, unlink),
+    # so the handles are closed right away.
+    my ( $tmpfile, $tmpfh ) = Mail::SpamAssassin::Util::secure_tmpfile();
+    my ( $errfile, $errfh ) = Mail::SpamAssassin::Util::secure_tmpfile();
+    close($tmpfh) if defined $tmpfh;
+    close($errfh) if defined $errfh;
+
+    unless ( defined $tmpfile and defined $errfile ) {
+        debuglog("Unable to create temporary files, aborting pipe_io()");
+        unlink($tmpfile) if defined $tmpfile;
+        unlink($errfile) if defined $errfile;
+        return ( 1, \@stdout, \@stderr );
+    }
+    if ( $tmpfile eq $errfile ) {
+        debuglog("Got same tmpfile twice! Aborting pipe_io() to avoid deadlocking");
+        unlink($tmpfile);
+        return ( 1, \@stdout, \@stderr );
+    }
+
+    if ( $pipecmd =~ /\$tmpdir/ ) {
+        $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir();
+        unless ( defined $tmpdir ) {
+            debuglog("Unable to create a temporary directory, aborting pipe_io()");
+            unlink($tmpfile);
+            unlink($errfile);
+            return ( 1, \@stdout, \@stderr );
+        }
+        $pipecmd =~ s/\$tmpdir/$tmpdir/g;
+        $filecount = shift || 0;
+    }
+    else {
+        $silent   = shift;
+        $ignerror = shift;
+    }
+
+    # Removes everything this call created; used on every exit below.
+    my $cleanup = sub {
+        if ( defined $tmpdir ) {
+            foreach my $nr ( 0 .. $filecount ) {
+                unlink( sprintf( "%s/out%02d.gif", $tmpdir, $nr ) );
+            }
+            rmdir($tmpdir);
+        }
+        unlink($tmpfile);
+        unlink($errfile);
+    };
+
+    $pipecmd =~ s/\$errfile/$errfile/g;
+
+    my $pipe_in;
+    my $pipe_pid = open( $pipe_in, '|-', "$pipecmd 1>$tmpfile 2>>$errfile" );
+    unless ($pipe_pid) {
+        # Never report success when the pipe could not even be started.
+        my $rcode = $? || 1;
+        unless ($silent) {
+            handle_error( $err_msges[0], ( $pipecmd, $? >> 8, $!, $tmpfile ) );
+        }
+        $cleanup->();
+        return ( $rcode, \@stdout, \@stderr );
+    }
+    print {$pipe_in} $input;
+    close($pipe_in);    # waits for the command and sets $?
+
+    my $status = $?;
+    if ( $status and not $ignerror ) {
+        unless ($silent) {
+            handle_error( $err_msges[1], ( $pipecmd, $status >> 8, $!, $tmpfile ) );
+        }
+        $cleanup->();
+        return ( $status, \@stdout, \@stderr );
+    }
+
+    if ($filecount) {
+        # Pick the largest of the files out00.gif .. out<filecount-1>.gif.
+        my $best_size = 0;
+        my $best_nr   = 0;
+        foreach my $nr ( 0 .. $filecount - 1 ) {
+            my $frame = sprintf( "%s/out%02d.gif", $tmpdir, $nr );
+            my $size  = ( -s $frame ) || 0;
+            if ( $size > $best_size ) {
+                $best_size = $size;
+                $best_nr   = $nr;
+            }
+        }
+
+        my $best_frame = sprintf( "%s/out%02d.gif", $tmpdir, $best_nr );
+        my $frame_fh;
+        unless ( open( $frame_fh, '<', $best_frame ) ) {
+            unless ($silent) {
+                handle_error( $err_msges[2],
+                    ( $best_frame, $pipecmd, $status >> 8, $!, $tmpfile ) );
+            }
+            $cleanup->();
+            return ( 1, \@stdout, \@stderr );
+        }
+        @stdout = <$frame_fh>;
+        close($frame_fh);
+
+        $cleanup->();
+        return ( 0, \@stdout, \@stderr );
+    }
+
+    # Regular case: read back what the command wrote to stdout and stderr.
+    my ( $out_fh, $err_fh );
+    my $unreadable;
+    if ( not open( $out_fh, '<', $tmpfile ) ) {
+        $unreadable = $tmpfile;
+    }
+    elsif ( not open( $err_fh, '<', $errfile ) ) {
+        $unreadable = $errfile;
+    }
+    if ( defined $unreadable ) {
+        unless ($silent) {
+            handle_error( $err_msges[2],
+                ( $unreadable, $pipecmd, $status >> 8, $!, $tmpfile ) );
+        }
+        $cleanup->();
+        return ( $status || 1, \@stdout, \@stderr );
+    }
+
+    @stdout = <$out_fh>;
+    @stderr = <$err_fh>;
+    close($out_fh);
+    close($err_fh);
+
+    $cleanup->();
+    return ( 0, \@stdout, \@stderr );
+}
+
+# Format an error message and write it to the log.
+#
+# Arguments: a sprintf() template followed by the values for its
+# placeholders.
+sub handle_error {
+    my $template = shift;
+    my @values   = @_;
+
+    my $message = sprintf( $template, @values );
+    logfile($message);
+}
+
+# Write a message to syslog: ident "SA-FuzzyOCR", facility "mail",
+# priority "info", one syslog entry per line of the message.
+#
+# Argument: the message text; it may contain several lines.
+#
+# Despite its name this sub no longer writes to $logfile; the former
+# file-based code and its hand-made timestamp were unused and are gone.
+sub logfile {
+    my ($logtext) = @_;
+    return unless defined $logtext;
+
+    openlog( 'SA-FuzzyOCR', 'pid', 'mail' );
+    foreach my $line ( split /\n/, $logtext ) {
+
+        # The text is passed as an argument to a fixed "%s" format: log
+        # messages include data taken from the mail (e.g. the content-type),
+        # so they must never be interpreted as a format string themselves.
+        syslog( 'info', '%s', $line );
+    }
+    closelog();
+}
+
+# Private helper: true when a numeric hash field from the database differs
+# from the field of the current image by more than $tolerance, relative to
+# the current image's value. Missing or non-numeric fields count as
+# different; a zero reference value only matches another zero (this avoids
+# a division by zero).
+sub _hash_field_differs {
+    my ( $db_val, $img_val, $tolerance ) = @_;
+
+    foreach my $val ( $db_val, $img_val ) {
+        return 1 unless defined $val and $val =~ /^\d+(?:\.\d+)?$/;
+    }
+    if ( $img_val == 0 ) {
+        return $db_val == 0 ? 0 : 1;
+    }
+    return ( abs( $db_val - $img_val ) / $img_val ) > $tolerance ? 1 : 0;
+}
+
+# Look up an image hash in the hash database file $digest_db.
+#
+# Argument: the hash string of the current image. It is parsed as
+#   "<s>:<h>:<w>:<cn>" followed by zero or more "::"-separated colour groups
+#   made of ":"-separated numbers.
+# Database lines are parsed as "<score>::<hash>".
+#
+# An entry matches when its four leading fields lie within the relative
+# tolerances $ts, $th, $tw and $tcn and every colour value differs by at
+# most $tc. Returns the score stored with the first matching entry, or 0
+# when nothing matches, the database cannot be opened, or an entry has a
+# different number of colour values than the current image.
+sub check_image_hash_db {
+    my ($digest) = @_;
+    return 0 unless defined $digest;
+
+    my ( $gpf, @gcf ) = split( '::', $digest );
+    return 0 unless defined $gpf;
+    my ( $gs, $gh, $gw, $gcn ) = split( ':', $gpf );
+    my @gcfs = map { split( ':', $_ ) } @gcf;
+
+    my $db;
+    unless ( open( $db, '<', $digest_db ) ) {
+        debuglog("No Image Hash database found at \"$digest_db\", or permissions wrong.");
+        return 0;
+    }
+
+    my $found_score = 0;
+  ENTRY:
+    while ( my $entry = <$db> ) {
+        chomp($entry);
+        my ( $score, $dpf, @dcf ) = split( '::', $entry );
+        next ENTRY unless defined $dpf;
+
+        my ( $ds, $dh, $dw, $dcn ) = split( ':', $dpf );
+        next ENTRY if _hash_field_differs( $ds,  $gs,  $ts );
+        next ENTRY if _hash_field_differs( $dh,  $gh,  $th );
+        next ENTRY if _hash_field_differs( $dw,  $gw,  $tw );
+        next ENTRY if _hash_field_differs( $dcn, $gcn, $tcn );
+
+        my @dcfs = map { split( ':', $_ ) } @dcf;
+        unless ( scalar(@gcfs) == scalar(@dcfs) ) {
+            logfile("Error in database format, aborting...");
+            last ENTRY;
+        }
+
+        # A colour value outside the tolerance rejects the whole entry. The
+        # label is required: a bare "next" would only continue this inner
+        # loop and every entry would pass the colour check.
+        foreach my $idx ( 0 .. $#gcfs ) {
+            next ENTRY if abs( $dcfs[$idx] - $gcfs[$idx] ) > $tc;
+        }
+
+        $found_score = $score;
+        last ENTRY;
+    }
+    close($db);
+
+    return $found_score;
+}
+
+# Append one "<score>::<hash>" line to the hash database file $digest_db.
+#
+# Arguments:
+#   $digest - the image hash string
+#   $score  - the score to store with it
+#
+# The file is always opened in append mode, which also creates it when it
+# does not exist yet; unlike a separate "create" open this can never
+# truncate a database that another process created in the meantime.
+# Failures are logged; nothing useful is returned.
+sub add_image_hash_db {
+    my ( $digest, $score ) = @_;
+
+    unless ( defined $digest and length $digest ) {
+        debuglog("No hash given, nothing to add to the Image Hash database...");
+        return;
+    }
+
+    unless ( -e $digest_db ) {
+        debuglog("Image Hash Database not found to add hash, creating it...");
+    }
+
+    my $db;
+    unless ( open( $db, '>>', $digest_db ) ) {
+        logfile("Unable to open/create Image Hash database at \"$digest_db\", check permissions.");
+        return;
+    }
+
+    debuglog("Adding hash \"$digest\" to Image Hash database...");
+    flock( $db, LOCK_EX );
+    seek( $db, 0, 2 );    # move to the end of the file before writing
+    print {$db} "${score}::${digest}\n";
+    flock( $db, LOCK_UN );
+
+    # Buffered write errors only show up when the handle is closed.
+    unless ( close($db) ) {
+        logfile("Unable to write to Image Hash database at \"$digest_db\": $!");
+    }
+    return;
+}
+
+# Compute the hash string of an image.
+#
+# Argument: the image data as one string; it is piped to "$identify -" and
+#   to "$ppmhist -noheader".
+#
+# Returns ( $rcode, $hash ). On success $rcode is 0 and $hash is
+#   "<length of the data>:<n1>:<n2>:<number of ppmhist output lines>"
+# where <n1> and <n2> are the two numbers of the first "<n1>x<n2>" pattern
+# in the identify output, followed by one "::<f1>:<f2>:<f3>:<f4>:<f5>" group
+# (the first five blank-separated fields) for each of the first $hash_ccnt
+# ppmhist output lines. On failure $rcode is non-zero and $hash is undefined.
+sub calc_image_hash {
+    my ($picdata) = @_;
+    my $hash;
+    my ( $h, $w );
+    my $s = length($picdata);
+
+    my ( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( "$identify -", $picdata );
+    if ($rcode) {
+        debuglog("Unable to calculate image hash, skipping...");
+        return ( $rcode, $hash );
+    }
+    foreach my $line (@$stdout_ref) {
+        if ( $line =~ /([0-9]+)x([0-9]+)/ ) {
+            ( $h, $w ) = ( $1, $2 );
+            last;
+        }
+    }
+
+    # Without dimensions the hash would contain undefined fields.
+    unless ( defined $h and defined $w ) {
+        debuglog("Unable to calculate image hash, skipping...");
+        return ( 1, $hash );
+    }
+
+    ( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( "$ppmhist -noheader", $picdata );
+    if ($rcode) {
+        debuglog("Unable to calculate image hash, skipping...");
+        return ( $rcode, $hash );
+    }
+
+    my $line_count = scalar(@$stdout_ref);
+    my @groups     = ();
+    if ($hash_ccnt) {
+        my $taken = 0;
+        foreach my $line (@$stdout_ref) {
+
+            # split ' ' skips leading blanks and collapses runs of blanks.
+            my ( $f1, $f2, $f3, $f4, $f5 ) = split( ' ', $line );
+            push( @groups, "::$f1:$f2:$f3:$f4:$f5" );
+
+            # Numeric comparison: as strings, "2" ge "10" is true.
+            $taken++;
+            last if $taken >= $hash_ccnt;
+        }
+    }
+
+    $hash = "$s:$h:$w:$line_count" . join( '', @groups );
+    return ( 0, $hash );
+}
+
+# Log a message, prefixed with "Debug mode: ", but only when $verbose is
+# greater than 1.
+sub debuglog {
+    my $message = $_[0];
+    logfile( "Debug mode: " . $message ) if $verbose > 1;
+}
+
+# Register a hit of the rule FUZZY_OCR_WRONG_CTYPE on the current message
+# ($pms) with the score $wctypescore. Does nothing when that score is false.
+#
+# Arguments:
+#   $format - name of the detected image format
+#   $ctype  - content-type that was declared for the image
+sub wrong_ctype {
+    my ( $format, $ctype ) = @_;
+    return unless $wctypescore;
+
+    my $rule      = "FUZZY_OCR_WRONG_CTYPE";
+    my $debuginfo = "";
+    if ( $verbose > 0 ) {
+        $debuginfo = "Image has format \"$format\" but content-type is \"$ctype\"";
+        debuglog($debuginfo);
+    }
+
+    # Store the score in all four score sets before reporting the hit.
+    my $formatted = sprintf( "%0.3f", $wctypescore );
+    $pms->{conf}->{scoreset}->[$_]->{$rule} = $formatted foreach 0 .. 3;
+
+    my $description = $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE};
+    $pms->_handle_hit( $rule, $wctypescore, "BODY: ",
+        $description . "\n$debuginfo" );
+}
+
+# Register a hit of the rule FUZZY_OCR_CORRUPT_IMG on the current message
+# ($pms). Does nothing when the selected score is false.
+#
+# Arguments:
+#   $unfixable - true: score with $cimgscore2, false: score with $cimgscore
+#   $err       - error text added to the hit description when $verbose > 0
+sub corrupt_img {
+    my ( $unfixable, $err ) = @_;
+
+    my $score = $unfixable ? $cimgscore2 : $cimgscore;
+    return unless $score;
+
+    my $rule      = "FUZZY_OCR_CORRUPT_IMG";
+    my $debuginfo = "";
+    if ( $verbose > 0 ) {
+        chomp($err);
+        $debuginfo = "Corrupt image: $err";
+        debuglog($debuginfo);
+    }
+
+    # Store the score in all four score sets before reporting the hit.
+    my $formatted = sprintf( "%0.3f", $score );
+    $pms->{conf}->{scoreset}->[$_]->{$rule} = $formatted foreach 0 .. 3;
+
+    my $description = $pms->{conf}->{descriptions}->{FUZZY_OCR_CORRUPT_IMG};
+    $pms->_handle_hit( $rule, $score, "BODY: ",
+        $description . "\n$debuginfo" );
+}
+
+# Register a hit of the rule FUZZY_OCR_KNOWN_HASH on the current message
+# ($pms).
+#
+# Arguments:
+#   $digest - the image hash, mentioned in the description when $verbose > 0
+#   $score  - the score to apply
+sub known_img_hash {
+    my ( $digest, $score ) = @_;
+
+    my $rule      = "FUZZY_OCR_KNOWN_HASH";
+    my $debuginfo = "";
+    if ( $verbose > 0 ) {
+        $debuginfo = "Hash \"$digest\" is in the database.";
+        debuglog($debuginfo);
+    }
+
+    # Store the score in all four score sets before reporting the hit.
+    my $formatted = sprintf( "%0.3f", $score );
+    $pms->{conf}->{scoreset}->[$_]->{$rule} = $formatted foreach 0 .. 3;
+
+    my $description = $pms->{conf}->{descriptions}->{FUZZY_OCR_KNOWN_HASH};
+    $pms->_handle_hit( $rule, $score, "BODY: ",
+        $description . "\n$debuginfo" );
+}
+
+# Scan the image parts of the current message ($pms) and score it.
+#
+# For every image part: detect the format from its first bytes (GIF, JPEG or
+# PNG), convert it to PNM with the helper programs, optionally look it up in
+# the image hash database, run every scanset over it and fuzzy-match the OCR
+# output against the word list. When the number of matches over all images
+# reaches $countreq, the rule FUZZY_OCR is hit with
+#   $base_score + ( matches - $countreq ) * $add_score
+# and, if enabled, the hashes of the scanned images are stored.
+#
+# Takes no arguments (it reads the package variables) and always returns 0.
+sub check_fuzzy_ocr {
+    my @found      = ();
+    my $image_type = 0;
+    my $picture_data;
+    my @hashes  = ();
+    my $cnt     = 0;
+    my $homedir = ( getpwuid($<) )[7];
+
+    # Work on a copy of the word list for the duration of this call.
+    # load_personal_words() appends to @words; without the copy the list
+    # would grow with every scanned message.
+    local @words = @words;
+
+    debuglog("Starting FuzzyOcr...");
+    debuglog("Attempting to load personal wordlist...");
+
+    if ($homedir) {
+        load_personal_words( $homedir . "/$pwordlist" );
+    }
+    elsif ( defined( $ENV{HOME} ) ) {
+        load_personal_words( $ENV{HOME} . "/$pwordlist" );
+    }
+    else {
+        debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available...");
+    }
+
+  PART:
+    foreach my $p ( $pms->{msg}->find_parts(qr/^image\b/i) ) {
+        my $cscore = $pms->get_score();
+        if ( $cscore > $dscore ) {
+            debuglog("Scan canceled, message has already more than $dscore points.");
+            return 0;
+        }
+        my $ctype = $p->{'type'};
+        next PART unless $ctype =~ /image/i;
+
+        debuglog("Analyzing file with content-type \"$ctype\"");
+        $picture_data = $p->decode();
+        next PART unless defined $picture_data and length $picture_data;
+
+        my @used_scansets = ();
+        my $stdout_ref;
+        my $stderr_ref;
+        my $rcode   = 0;
+        my $corrupt = 0;
+        my $digest;
+
+        if ( substr( $picture_data, 0, 3 ) eq "\x47\x49\x46" ) {
+            if ( $ctype !~ /gif/i ) {
+                wrong_ctype( "GIF", $ctype );
+            }
+            $image_type = 1;
+            my $interlaced_gif = 0;
+            my $image_count    = 0;
+
+            ( $rcode, $stdout_ref, $stderr_ref ) =
+              pipe_io( $giftext, $picture_data, 1, 1 );
+
+            foreach my $line (@$stdout_ref) {
+                if ( not $interlaced_gif and $line =~ /Image is Interlaced/i ) {
+                    $interlaced_gif = 1;
+                }
+                if ( $line =~ /^Image #/ ) {
+                    $image_count++;
+                }
+            }
+
+            # Frame counts are compared numerically; as strings "10" would
+            # sort below "5".
+            my $multi_frame = ( $image_count > 1 );
+
+            if ( $interlaced_gif or $multi_frame ) {
+                debuglog("Image is interlaced or animated...");
+            }
+            else {
+                debuglog("Image is single non-interlaced...");
+                ( $rcode, $stdout_ref, $stderr_ref ) =
+                  pipe_io( "$giffix", $picture_data, 0, 1 );
+                $picture_data = join( '', @$stdout_ref );
+            }
+
+            foreach my $line (@$stderr_ref) {
+                if ( $line =~ /GIF-LIB error/i ) {
+                    $corrupt = $line;
+                    last;
+                }
+            }
+
+            if ( $corrupt and ( $interlaced_gif or $multi_frame ) ) {
+                debuglog("Skipping corrupted interlaced image...");
+                corrupt_img( 1, $corrupt );
+                next PART;
+            }
+            elsif ($corrupt) {
+                unless ($picture_data) {
+                    debuglog("Uncorrectable corruption detected, skipping non-interlaced image...");
+                    corrupt_img( 1, $corrupt );
+                    next PART;
+                }
+                debuglog("Image is corrupt, but seems fixable, continuing...");
+                corrupt_img( 0, $corrupt );
+            }
+
+            if ($multi_frame) {
+                debuglog("File contains more than one image...");
+                if ( $image_count < $max_images ) {
+                    debuglog("Assembling images...");
+                    ( $rcode, $stdout_ref, $stderr_ref ) =
+                      pipe_io( "$convert - +append -", $picture_data );
+                    next PART if $rcode;
+                    $picture_data = join( '', @$stdout_ref );
+                }
+                elsif ( $pre314 eq 0 ) {
+                    debuglog("Image count exceeds limit, skipping some...");
+                    ( $rcode, $stdout_ref, $stderr_ref ) =
+                      pipe_io( "$gifasm -d \$tmpdir/out", $picture_data, $image_count );
+                    next PART if $rcode;
+                    $picture_data = join( '', @$stdout_ref );
+                }
+                else {
+                    debuglog("Image count exceeds limit, but your version does not allow the required functions, skipping image...");
+                    next PART;
+                }
+            }
+
+            if ($interlaced_gif) {
+                ( $rcode, $stdout_ref, $stderr_ref ) =
+                  pipe_io( "$gifinter -s | $giftopnm -", $picture_data );
+            }
+            else {
+                ( $rcode, $stdout_ref, $stderr_ref ) =
+                  pipe_io( "$giftopnm -", $picture_data );
+            }
+            next PART if $rcode;
+        }
+        elsif ( substr( $picture_data, 0, 2 ) eq "\xff\xd8" ) {
+            if ( $ctype !~ /jpe{0,1}g/i ) {
+                wrong_ctype( "JPEG", $ctype );
+            }
+            $image_type = 2;
+            ( $rcode, $stdout_ref, $stderr_ref ) =
+              pipe_io( "$jpegtopnm", $picture_data );
+            next PART if $rcode;
+        }
+        elsif ( substr( $picture_data, 0, 4 ) eq "\x89\x50\x4e\x47" ) {
+            if ( $ctype !~ /png/i ) {
+                wrong_ctype( "PNG", $ctype );
+            }
+            $image_type = 3;
+            ( $rcode, $stdout_ref, $stderr_ref ) =
+              pipe_io( "$pngtopnm -", $picture_data );
+            next PART if $rcode;
+        }
+        else {
+            $image_type = 0;
+            debuglog(
+"Image type not recognized, unknown format. Skipping this image..."
+            );
+            next PART;
+        }
+
+        debuglog("Recognized file type: $image_type");
+
+        my @pnmdata = @$stdout_ref;
+        if ($enable_image_hashing) {
+            debuglog("Calculating the image hash...");
+            ( $rcode, $digest ) = calc_image_hash( join( '', @pnmdata ) );
+            if ($rcode) {
+                debuglog("Error calculating the image hash, skipping hash check...");
+            }
+            else {
+                if ( my $score = check_image_hash_db($digest) ) {
+                    debuglog("Image found in hash database, message is spam...");
+                    debuglog("Scoring with known old score and ending...");
+                    known_img_hash( $digest, $score );
+                    return 0;
+                }
+
+                # Only a successfully computed hash is remembered; after a
+                # hashing error $digest is undefined.
+                debuglog("Hash not yet known to the database, saving for later db storage...");
+                push( @hashes, $digest );
+            }
+        }
+        else {
+            debuglog("Image hashing disabled in configuration, skipping...");
+        }
+
+        my @ocr_results = ();
+        foreach my $scanset_template (@scansets) {
+
+            # Substitute on a copy so the configured scansets stay untouched.
+            my $scanset = $scanset_template;
+            $scanset =~ s/\$gocr/$gocr/;
+            ( $rcode, $stdout_ref, $stderr_ref ) =
+              pipe_io( "$scanset", join( '', @pnmdata ), 1 );
+
+            if ($rcode) {
+                debuglog( join( '', @$stderr_ref ) );
+                debuglog(
+"Skipping scanset \"$scanset\" because of errors, trying next..."
+                );
+                next;
+            }
+
+            my @ocrdata = @$stdout_ref;
+            push( @ocr_results, [@ocrdata] );
+            push( @used_scansets, $scanset );
+        }
+
+        # Normalise every OCR line once: map look-alike characters to
+        # letters, drop everything that is not a letter, lower-case the rest.
+        foreach my $ocr_set (@ocr_results) {
+            foreach my $line (@$ocr_set) {
+                $line =~ tr/!;|081/iiioal/;
+                $line =~ s/[^a-zA-Z]//g;
+                $line = lc $line;
+            }
+        }
+
+        foreach my $entry (@words) {
+
+            # Work on a copy: the entry may carry a "::<threshold>" suffix
+            # that has to survive for the next image.
+            my $w = $entry;
+            my $wthreshold;
+            if ( $w =~ /^(.*?)::(0(\.\d+){0,1})/ ) {
+                ( $w, $wthreshold ) = ( $1, $2 );
+            }
+            else {
+                $wthreshold = $threshold;
+            }
+            $w =~ s/[^a-zA-Z]//g;
+            $w = lc $w;
+
+            # Nothing left to match against.
+            next if $w eq '';
+
+            my $wcnt = 0;
+            my $gcnt = 0;
+            foreach my $ocr_set (@ocr_results) {
+                my $cwcnt = 0;
+                foreach my $line (@$ocr_set) {
+                    my $matched = adistr( $w, $line );
+                    if ( abs($matched) < $wthreshold ) {
+                        $cwcnt++;
+                        debuglog(
+"Found word \"$w\" in line\n \"$line\" \n with fuzz of "
+                              . abs($matched)
+                              . " scanned with scanset $used_scansets[$gcnt]"
+                        );
+                    }
+                }
+
+                # Per word, the best scanset result counts.
+                $wcnt = max( $wcnt, $cwcnt );
+                $gcnt++;
+            }
+            $cnt += $wcnt;
+            if ( ( $verbose > 0 ) and ($wcnt) ) {
+                push( @found, "\"$w\" in $wcnt lines" );
+            }
+        }
+    }
+
+    if ( $cnt >= $countreq ) {
+        my $score = ( $base_score + ( $cnt - $countreq ) * $add_score );
+        if ( $enable_image_hashing and $hashing_learn_scanned ) {
+            debuglog("Message is spam (score $score), storing all image hashes in database...");
+            foreach my $hash (@hashes) {
+                add_image_hash_db( $hash, $score );
+            }
+        }
+        else {
+            debuglog("Message is spam (score $score)...");
+        }
+        my $debuginfo = "";
+        if ( $verbose > 0 ) {
+            $debuginfo =
+              ( "Words found:\n"
+                  . join( "\n", @found )
+                  . "\n($cnt word occurrences found)" );
+            debuglog($debuginfo);
+        }
+        for my $set ( 0 .. 3 ) {
+            $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR"} =
+              sprintf( "%0.3f", $score );
+        }
+        $pms->_handle_hit( "FUZZY_OCR", $score, "BODY: ",
+            $pms->{conf}->{descriptions}->{FUZZY_OCR} . "\n$debuginfo" );
+    }
+    debuglog("FuzzyOcr ending successfully...");
+    return 0;
+}
+
+1;