Do spam classification right here, 2

git-svn-id: svn+ssh://asteria.noreply.org/svn/weaselutils/trunk@129 bc3d92e2-beff-0310-a7cd-cc87d7ac0ede
author: Peter Palfrader <peter@palfrader.org> 2006-06-15 21:21:35 +0000
committer: weasel <weasel@bc3d92e2-beff-0310-a7cd-cc87d7ac0ede> 2006-06-15 21:21:35 +0000
commit: 90e27cc42fd410bd4cc28693214be37c1484aad6 (patch)
tree: 34e0c18d75f3e4db5267acc85ed1262292733ab3
parent: eef706ac7e1dfdaa6e61108f9e2cffba611ebda8 (diff)
1 files changed, 23 insertions, 18 deletions
diff --git a/split-mailman-mails-and-discard-and-save b/split-mailman-mails-and-discard-and-save
index 61298cc..7fc175f 100755
--- a/split-mailman-mails-and-discard-and-save
+++ b/split-mailman-mails-and-discard-and-save
@@ -16,22 +16,22 @@ end
 
 
 
-OUTBOX="mail/outbox"
+OUTBOX    = "mail/outbox"
+SPAMLEARN = "mail/spam-learn"
+HAMLEARN  = "mail/ham-learn"
 check_maildir OUTBOX
+check_maildir SPAMLEARN
+check_maildir HAMLEARN
 APPROVE_PASSWORD = YAML::load( File.open( 'mailman-passwords.yaml' ) )
 
 if ARGV[0] == "spam"
 	ACTION    = "spam"
 	MAILIN    = "mail/spam-in"
-	MAILLEARN = "mail/spam-learn"
 	check_maildir MAILIN
-	check_maildir MAILLEARN
 elsif ARGV[0] == "ham"
 	ACTION    = "ham"
 	MAILIN    = "mail/ham-in"
-	MAILLEARN = "mail/ham-learn"
 	check_maildir MAILIN
-	check_maildir MAILLEARN
 elsif ARGV[0] == "classify"
 	ACTION    = "classify"
 	MAILIN    = "mail/mailman-moderator-requests"
@@ -45,7 +45,7 @@ elsif ARGV[0] == "classify"
 	check_maildir MAIL_PROCESSED_HAM
 	check_maildir MAIL_PROCESSED_FORWARDED
 else
-	STDERR.puts "Usage: #{$0} ham|spam"
+	STDERR.puts "Usage: #{$0} ham|spam|classify"
 	exit 1
 end
 
@@ -119,6 +119,7 @@ def runcmd(command, input)
 		wrin.close
 		rdout.close
 		rderr.close
+		STDIN.reopen rdin
 		STDOUT.reopen wrout
 		STDERR.reopen wrerr
 		exec(*command)
@@ -130,7 +131,7 @@ def runcmd(command, input)
 
 	out = []
 	err = []
-	tin  = Thread.new { wrin.print input }
+	tin  = Thread.new { wrin.print input; wrin.close }
 	tout = Thread.new { out = rdout.readlines }
 	terr = Thread.new { err = rderr.readlines }
 	tin.join
@@ -146,8 +147,8 @@ end
 def runnoerrors(command, input)
 	exitstatus, out, err = runcmd(command, input)
 	cmd = command.join(' ')
-	throw "command '#{cmd}' returned with non-zero exit status #{exitstatus}"
-	throw "command '#{cmd}' returned with output on stderr: #{err.join}"
+	throw "command '#{cmd}' returned with non-zero exit status #{exitstatus}" if exitstatus != 0
+	throw "command '#{cmd}' returned with output on stderr: #{err.join}" if err.length > 0
 
 	out
 end
@@ -159,11 +160,11 @@ def sa_check(message)
 	throw "Could not find score in spamassassin output line1: '#{line1}'" unless matchdata and matchdata[1]
 	score = matchdata[1].to_f
 
-	c = (score < 1.0) ? "ham" :
-	    (score > 6.0) ? "spam" :
-	                    "unsure"
+	c = (score < 1.0) ? "Ham" :
+	    (score > 6.0) ? "Spam" :
+	                    "Unsure"
 	
-	[c, out.join]
+	[c, out.join, score]
 end
 
 def bogo_check(message)
@@ -203,19 +204,23 @@ def process_mail(filename)
 
 
 	if ACTION == "ham"
-		store_in_maildir(MAILLEARN, held_part)
+		store_in_maildir(HAMLEARN, held_part)
 		approve(cookie, request_address)
 	elsif ACTION == "spam"
-		store_in_maildir(MAILLEARN, held_part)
+		store_in_maildir(SPAMLEARN, held_part)
 		discard(cookie, request_address)
 	elsif ACTION == "classify"
-		sa_class  , sa_text   = sa_check(held_part)
+		sa_class  , sa_text  , sa_score = sa_check(held_part)
 		bogo_class, bogo_text = bogo_check(held_part)
 
-		if sa_class == "ham" and bogo_class == "ham"
+		if sa_class == "Ham" and bogo_class == "Ham"
 			store_in_maildir(MAIL_PROCESSED_HAM, message)
 			approve(cookie, request_address)
-		elsif sa_class == "spam" and bogo_class == "spam"
+		elsif sa_class == "Spam" and bogo_class == "Spam"
+			store_in_maildir(MAIL_PROCESSED_SPAM, message)
+			discard(cookie, request_address)
+		elsif sa_class == "Spam" and sa_score > 10 # but bogo did not match
+			store_in_maildir(SPAMLEARN, held_part) # so we let it learn it
 			store_in_maildir(MAIL_PROCESSED_SPAM, message)
 			discard(cookie, request_address)
 		else
author	Peter Palfrader <peter@palfrader.org>	2006-06-15 21:21:35 +0000
committer	weasel <weasel@bc3d92e2-beff-0310-a7cd-cc87d7ac0ede>	2006-06-15 21:21:35 +0000
commit	90e27cc42fd410bd4cc28693214be37c1484aad6 (patch)
tree	34e0c18d75f3e4db5267acc85ed1262292733ab3
parent	eef706ac7e1dfdaa6e61108f9e2cffba611ebda8 (diff)