#!/usr/bin/crm
#
#        a whitelist / blacklist / command based mail sorter
#
#	Designed to be used with procmail or to be used as a filter
#	with a MTA.  Inserts a header into the message that can be
#	used with MUA.
#
##############################################################
#   for testing purposes, log all incoming mail
#   (appends the raw data window, i.e. the message as received, to
#   allmail.txt before anything else touches it)
output [allmail.txt] <append> /:*:_dw:/
#
#
#    ---  include the configuration file  ---
#    NOTE(review): this script later reads :spw:, :lcr:, :do_normalize:,
#    :mime_decoder:, :automatic_training:, :autotrain_address: and
#    :rejected_mail_exit_code: without setting them - presumably they
#    are all defined in mailfilterconfig.crm.  Confirm.
insert mailfilterconfig.crm
#
#    --- make a safe space to keep the results of our work ---
#
#    :classifier_reason: - human-readable verdict text.  The wrap-up
#    code copies it into the X-CRM114-Status header and the autotrain
#    code tests it for a leading ACCEPT or REJECT.
isolate (:classifier_reason:)
alter (:classifier_reason:) /no reason yet/
#
#    :our_exit_code: - the value handed to the final exit statement.
isolate (:our_exit_code:)
alter (:our_exit_code:) /0/
#
#
#    :m_text: - working copy of the mail for the statistical classifier.
#    It starts out empty and is only filled in once the classifier
#    stage is reached.
isolate (:m_text:)
alter (:m_text:) //
#
# -------------check for the COMMAND WORD ----------
#
#    A mail containing a line of the form
#        command <password> <verb> <args...>
#    is treated as a control message rather than as mail to be sorted.
#    Recognized verbs:
#        whitelist <pattern>   - append <pattern> to whitelist.mfp
#        blacklist <pattern>   - append <pattern> to blacklist.mfp
#        nonspam               - learn the rest of the mail as nonspam
#        spam                  - learn the rest of the mail as spam
#        learn <class>         - learn the rest of the mail into <class>.css
#    Every verb that succeeds annotates the command line inside the mail,
#    emits the mail with accept, and exits with code 0.  If no command
#    line is present, the password is wrong, or no verb matches, control
#    falls out of this bracket set and the mail is classified normally.
#
{
    #
    #    grab the command word as :c:, password as :pw:, and any arg(s) as :a:
    #    (:z: is the whole command line, :c: is everything after the
    #    password, so it holds the verb AND its arguments)
    #
    match <nomultiline> (:z: :pw: :c: ) [:_dw:] /^command ([[:graph:]]+) (.*)/
    #
    #    check the password.  If it's invalid, FAIL out of this bracket set
    #    and just treat this as ordinary (non-command) mail.
    #
    #    The pattern is anchored at both ends and grouped so that the
    #    WHOLE supplied token has to match the secret.  An unanchored
    #    search would also accept any token that merely contains the
    #    secret somewhere inside it.
    match [:pw:] /^(:*:spw:)$/
    {
	#    was it a command to add something to the whitelist?
	match <nomultiline> (:q: :a:) [:c:] /whitelist (.*)/
	#    yes - store the pattern, one per line, for the whitelist stage
	output [whitelist.mfp] <append> /:*:a::*:_nl:/
	#    flag the command line in the mail so the reader sees it ran
	alter (:z:) /*** :*:z: *** :*:_nl:Whitelist command executed! :*:_nl:/
	accept
	exit /0/
    }
    {
	#    was it a command to add something to the blacklist?
	match <nomultiline> (:q: :a:) [:c:] /blacklist (.*)/
	#    yes - store the pattern, one per line, for the blacklist stage
	output [blacklist.mfp] <append> /:*:a::*:_nl:/
	alter (:z:) /*** :*:z: *** :*:_nl:Blacklist command executed! :*:_nl:/
	accept
	exit /0/
    }
    {
	#     was it a command to learn something as nonspam?
	#     (this test must come before the spam test below, because
	#     the word nonspam also contains the word spam)
	match [:c:] /nonspam/
	#     re-match without nomultiline so that :text: picks up
	#     everything in the mail that follows the command
	match (:z: :text:) [:_dw:] /:*:_nl:command [[:graph:]]+ nonspam(.*)/
	#      and learn it as nonspam
	output [nonspamtext.txt] <append> /:*:text:/
	learn (nonspam.css) [:text:] /:*:lcr:/
	alter (:z:) /:*:_nl:*** Learn nonspam executed! *** :*:_nl:  :*:z:/
	accept
	exit /0/
    }
    {
	#     was it a command to learn something as spam?
	match [:c:] /spam/
	#     the second match insists that spam is the verb itself, so a
	#     command that only mentions spam later on fails out of here
	match (:z: :text:) [:_dw:] /:*:_nl:command [[:graph:]]+ spam(.*)/
	#      and learn it as spam
	output [spamtext.txt] <append> /:*:text:/
	learn (spam.css) [:text:] /:*:lcr:/
	alter (:z:) /:*:_nl:*** Learn spam executed! *** :*:_nl:  :*:z:/
	accept
	exit /0/
    }
    {
	#     was it a command to learn something as an arbitrary type?
	# Note: the files this generates don't get used for anything yet.
	#
	match [:c:] /learn/
	#     :file: is the class name, :text: is the rest of the mail
	match (:z: :file: :text:) [:_dw:] /:*:_nl:command [[:graph:]]+ learn ([[:graph:]]+)(.*)/
	#      and learn it
	#     NOTE(review): :file: comes straight from the mail and is used
	#     as a file name.  Only the password stands between a sender
	#     and writing to an arbitrary path.
	output [:*:file:.txt] <append> /:*:text:/
	learn (:*:file:.css) [:text:] /:*:lcr:/
	alter (:z:) /:*:_nl:*** Learn to class :*:file: executed! ***:*:_nl:  :*:z:/
	accept
	exit /0/
    }

}
#     none of the above - classify this incoming mail instead.
#     first according to priority action list,
#     then according to whitelist,
#     then according to blacklist,
#     then according to the CRM sparse spectral classifier.
#   
#     Check it against the priority action list.  Each line of this
#     list is a + or a -, followed by a pattern.  + means accept,
#     - means reject.  The entries are tried in the order given, which
#     differs from the whitelist and blacklist, where every whitelist
#     entry is tried before any blacklist entry.  The priority
#     action list is tried before whitelist or blacklist.
#
#    Load the priority action list into its own variable.
isolate (:priolist:)
input (:priolist:) [priolist.mfp]
#    reset matching on :priolist: to the start of the string
match [:priolist:] //
#
#    Loop over the list one line at a time.  The loop ends when the
#    fromend match below finds no further line - that failure drops
#    control out of the bottom of this bracket set.
{
	#... Grab the next line of the list: :pm: is its first character
	#    (the + or - marker) and :pr: is the rest of the line, which
	#    is used as a regex.
	match <fromend nomultiline> (:w: :pm: :pr:) [:priolist:]  /(.)(.+)/
	#... see if this regex matches the incoming mail
	{
		match <nomultiline nocase> (:reason:) /:*:pr:/
		#  Yep, it matched... branch based on pm
		#  (:reason: now holds the part of the mail that matched)
		#
		{
			match [:pm:] /[+]/
			# put in a little tag saying why prio-listed
			alter (:classifier_reason:) /ACCEPT: CRM114 Priority Whitelisted by: :*:reason: **:*:_nl:/
			output [accepted_by_whitelist.txt] <append> /X-CRM114-Status: :*:classifier_reason::*:_dw:/
			goto /:looks_good:/
		}	
		#   No, we didn't have a +, so it was a - and we reject.  
		#   NOTE(review): the marker is never actually compared with -,
		#   so a line starting with any other character is also
		#   treated as a reject rule.  Confirm that is intended.
		alter (:classifier_reason:) /REJECT: CRM114 Priority Blacklisted by: :*:reason: **:*:_nl:/
		output [rejected_by_blacklist.txt] <append> /X-CRM114-Status: :*:classifier_reason::*:_dw:/
		goto /:looks_bad:/
		#
		#
	}
	#   Nope, didn't match as a priority rule... grab the next regex
	liaf
}
#
#
#     check it against the whitelist... load the whitelist...
#    Load the whitelist (one regex per line) into its own variable.
isolate (:whitelist:)
input (:whitelist:) [whitelist.mfp]
#    reset matching on whitelist to start of string
match [:whitelist:] //
#
#    Loop over the whitelist one line at a time.  When the fromend
#    match finds no further line it fails, and control drops out of
#    the bottom of this bracket set to the blacklist stage.
{
	#... Grab the next line of the list - each line is one regex
	match <fromend nomultiline> (:waste: :whregex:) [:whitelist:]  /(.+)/
	#... see if this regex matches the incoming mail
	{
		match <nomultiline nocase> (:reason:) /:*:whregex:/
		#  Yep, it matched... whitelist this email
		#  (:reason: now holds the part of the mail that matched)
		#
		# put in a little tag saying why whitelisted:
		alter (:classifier_reason:) /ACCEPT: CRM114 Whitelisted by: :*:reason: **:*:_nl:/
		output [accepted_by_whitelist.txt] <append> /X-CRM114-Status: :*:classifier_reason::*:_dw:/
		goto /:looks_good:/
	}
	#   Nope, didn't match... grab the next regex and try again,
	liaf
}

#
#    No joy, maybe we should blacklist it.
#
#     check it against the blacklist
#    Load the blacklist (one regex per line) into its own variable.
isolate (:blacklist:)
input (:blacklist:) [blacklist.mfp]
#    reset matching on blacklist to start of string
match [:blacklist:] //
#    Loop over the blacklist one line at a time.  When the fromend
#    match finds no further line it fails, and control drops out of
#    the bottom of this bracket set to the statistical classifier.
{
	#... Grab the next line of the list - each line is one regex
	match <fromend nomultiline> (:waste: :blregex:) [:blacklist:]  /(.+)/
	#... see if this regex matches the incoming mail
	{
		match <nomultiline nocase> (:reason:) /:*:blregex:/
		#  Yep, it matched... blacklist this email
		#  (:reason: now holds the part of the mail that matched)
		#
		# put in a little tag saying why blacklisted
		alter (:classifier_reason:) /REJECT: CRM114 Blacklisted by: :*:reason: ** :*:_nl:/
		output [rejected_by_blacklist.txt] <append> /X-CRM114-Status: :*:classifier_reason::*:_dw:/
		goto /:looks_bad:/
	}
	#   Nope, didn't match... grab the next regex and try again
	liaf
}
#
#
#       OK, it wasn't a command, wasn't whitelisted, wasn't blacklisted,
#	so it's time to pull out the learning heuristics and let 'er rip.
#	This stage always leaves through a goto - to :looks_good: when
#	the classify statement succeeds, to :looks_bad: when it fails.
{
	#   m_text is "mutilated text" - the result of all our
	#  machinations and hackages.  It's a _copy_ of the incoming 
	#  text, for mutilation
	alter (:m_text:) /:*:_dw:/

	#   expansion 1: - do we perform mime expansions?
	#   (skipped entirely unless :do_normalize: contains yes)
	{
	    match [:do_normalize:] /yes/
	    #
	    # Use external program to decode mime, remove most
	    # headers, and change the charset to UTF-8
	    #
	    # The text is fed to the command named by :mime_decoder:
	    # and whatever that command prints replaces :m_text:.
	    # NOTE(review): what the decoder really does depends on the
	    # configured command, which is not visible in this file.
	    isolate (:exp_text:)
	    syscall (:*:m_text:) (:exp_text:) /:*:mime_decoder:/
	    alter (:m_text:) /:*:exp_text:/
	}
	#
	#    all done


	# Run the CSS classifier against the "expanded" text -
	# if it classifies as SPAM
	#   then output into "rejected.txt".  
	#
	# :stats: receives the classifier's report text, which is
	# appended to the verdict on both the accept and reject paths.
	isolate (:stats:)
	{
		# nonspam.css is on the success side of the bar and
		# spam.css on the fail side, so a spam verdict fails
		# out of this inner bracket set to the REJECT code below.
		classify ( nonspam.css | spam.css ) ( :stats: ) [:m_text:] /:*:lcr:/
		alter (:classifier_reason:) /ACCEPT: CRM114 PASS SBPH\/BCR TEST** :*:_nl::*:stats:/
		output [accepted_by_css.txt] <append> /:*:_dw::*:_nl::*:classifier_reason::*:_nl: - armor-pierced and decommented text was -:*:_nl::*:m_text::*:_nl:-0-0-0-0-0-0-0-0-0-0-0-0-0-:*:_nl::*:_nl:/
		goto /:looks_good:/
	}
	# classify failed - record the mail as rejected by the classifier
	alter (:classifier_reason:) /REJECT: CRM114 FAIL SBPH\/BCR TEST** :*:_nl::*:stats:/
	output [rejected_by_css.txt] <append> /:*:_dw::*:_nl::*:classifier_reason::*:_nl: - armor-pierced and decommented text was -:*:_nl::*:m_text::*:_nl:-0-0-0-0-0-0-0-0-0-0-0-0-0-:*:_nl::*:_nl:/
	goto /:looks_bad:/
}
#
#
#         Final wrap-up routines - dispose of the mail as appropriate.
#
#         :looks_bad: is the landing point for every REJECT verdict.
#         It tags the mail with an X-CRM114-Status header, emits the
#         mail with accept, and sets the exit code from
#         :rejected_mail_exit_code: (not set in this file - presumably
#         it comes from the configuration file).
{
	:looks_bad:

	#
	# Remove line feeds from the status message
	# (work on a copy so :classifier_reason: itself stays intact
	# for the autotraining tests later on)
	#
	isolate (:header_reason:)
	alter (:header_reason:) /:*:classifier_reason:/
	{
		# each pass turns one run of newlines into a single
		# space.  When no newline is left the match fails and
		# the loop ends.
		match [:header_reason:] (:n:) /:*:_nl:+/
		alter (:n:) / /
		liaf
	}
	#
	# Insert the status message as a header, right after the
	# first line (the From line) of the headers.  This assumes
	# we have more than one header line.
	#
	{
		# :secondlinestart: covers the first newline that is
		# followed by a letter, plus that letter.  The alter
		# puts the new header line in front of it.
		match [:_dw:] (:secondlinestart:) /:*:_nl:[[:alpha:]]/
		alter (:secondlinestart:) /:*:_nl:X-CRM114-Status: :*:header_reason::*:secondlinestart:/
	}
	accept
	alter (:our_exit_code:) /:*:rejected_mail_exit_code:/
	goto /:finish_up:/
}
#
#          and here's where we accept something as good email.
#          :looks_good: is the landing point for every ACCEPT verdict.
#          It performs the same header tagging as :looks_bad: but
#          leaves the exit code at 0.
{
	:looks_good:

	#
	# Remove line feeds from the status message
	# (work on a copy so :classifier_reason: itself stays intact
	# for the autotraining tests later on)
	#
	isolate (:header_reason:)
	alter (:header_reason:) /:*:classifier_reason:/
	{
		# each pass turns one run of newlines into a single
		# space.  When no newline is left the match fails and
		# the loop ends.
		match [:header_reason:] (:n:) /:*:_nl:+/
		alter (:n:) / /
		liaf
	}
	#
	# Insert the status message as a header, right after the
	# first line (the From line) of the headers.  This assumes
	# we have more than one header line.
	#
	{
		# :secondlinestart: covers the first newline that is
		# followed by a letter, plus that letter.  The alter
		# puts the new header line in front of it.
		match [:_dw:] (:secondlinestart:) /:*:_nl:[[:alpha:]]/
		alter (:secondlinestart:) /:*:_nl:X-CRM114-Status: :*:header_reason::*:secondlinestart:/
	}
	accept
	alter (:our_exit_code:) /0/
	goto /:finish_up:/
}
#
#         Here's where we finish up processing in all the paths.
#         
:finish_up:
#    Optional automatic training, then exit.  Each of the three guard
#    matches below can fail, and any failure skips straight to the
#    exit statement without training.
{
#    ---- should we consider automatic training?	
	match [:automatic_training:] /yes/
	# bounce out if we've already auto-trained this email
	match <absent> /AUTO-TRAINED/
	isolate (:msghash:)
	hash (:msghash:) /:*:_dw:/
	#        pick one in 16- here, if the second-to-last digit is a 0
	match [:msghash:] /......0./
	#
	#       Yep... we should use this for autotraining
	#
	#       The text appended to the corpus file is :m_text:, the
	#       same text handed to learn.  (:text: is only ever bound
	#       in the command branches, and those all exit before
	#       control can get here, so it must not be used.)
	#       NOTE(review): :m_text: is only filled in by the
	#       statistical classifier stage, so mail decided by the
	#       priority list, whitelist or blacklist trains on empty
	#       text.  Confirm whether that is acceptable.
	#
	#       do we auto-train on acceptance? 
	{ 
		match [:classifier_reason:] /^ACCEPT/
		#   it wasn't spam... autotrain it "nonspam"
		output [nonspamtext.txt] <append> /:*:m_text:/
		learn (nonspam.css) [:m_text:]  /:*:lcr:/
		goto /:autotrain_finish:/
	}

	#      or do we autotrain on rejection       
	{ 
		match [:classifier_reason:] /^REJECT/
		#   it was spam... autotrain it "spam"
		output [spamtext.txt] <append> /:*:m_text:/
		learn (spam.css) [:m_text:] /:*:lcr:/
		goto /:autotrain_finish:/
	}
	:autotrain_finish:
	#   mail the verdict plus the tagged message to the autotrain
	#   address so that a human can check what was learned
	syscall (:*:classifier_reason: :*:_nl: :*:_dw:) /mail -s "AUTO-TRAINED email - please check" :*:autotrain_address:/
}
:exit_here:
exit /:*:our_exit_code:/