Initial commit

Stacy Brock
2021-03-30 09:46:40 -07:00
commit 11f692cc42
8 changed files with 377 additions and 0 deletions

4
.cache/.gitignore vendored Normal file

@@ -0,0 +1,4 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
mail-filter.conf
__pycache__

3
README.md Normal file

@@ -0,0 +1,3 @@
# o365-mail-filter
... because spam takes many forms.

180
filter-rules.py Normal file

@@ -0,0 +1,180 @@
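# sender addresses whose messages are always moved to Junk Email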
BLOCK_EMAIL = [
'chirhart@amazon.com',
'ron.krogel@citrix.com',
'replieswelcome@duo.com',
'webinars@duo.com',
'diversity@oregonstate.edu',
'jramiro@pagerduty.com',
'info@snowflake.com',
'lisa@duo.com',
'cortana@microsoft.com'
]
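# phrases that mark a message as spam when found (case-insensitively) in the subject or body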
BLOCK_KEYWORDS = [
'charitable fund drive',
'election reminder',
'email preferences',
'end these email updates',
'food drive',
'food share',
'general election',
'giving tuesday',
'hardship leave donations needed',
'manage your preferences',
'modify your preferences',
'opt-out',
'opt out',
'prefer not to receive',
'prefer not to see',
'register to vote',
'Samsung SDS America',
'sidekickopen',
'special election',
'subscription preferences',
'survey',
'unsubscribe',
'voter registration',
'want to receive',
'webinar',
'whitepaper',
'wish to receive',
]
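# sender domains to junk; matched against the From address and the message headers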
BLOCK_DOMAINS = [
]
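# senders and domains that are always kept, unless the exact address is also in BLOCK_EMAIL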
ALLOW = [
'oregonstate.edu',
'github.com',
'duo.com',
'sns.amazonaws.com',
'opsgenie.net',
'notify@teamdynamixapp.com',
'newsbites@email.sans.org',
'noreply@box.com',
'noreply@email.teams.microsoft.com',
'no-reply@sharepointonline.com',
'slalom.com',
'govdelivery.com',
'linkoregon.org',
'megan@pdxwit.org',
'busyconf.com',
'support@githubsupport.com',
'microsoft.com'
]
def filter_message(self, message):
# normalize message attributes
normalized_to = [x.address.lower() for x in message.to]
normalized_from = message.sender.address.lower()
normalized_subject = message.subject.lower()
# filter alerts-sig
if (normalized_from in ['mcc-b11-stor1@oregonstate.edu',
'mcc-b12-stor1@oregonstate.edu',
'ousclus@oregonstate.edu',
'isilon@storage.sig.oregonstate.edu']
or 'alarm.DatastoreDiskUsageAlarm' in message.subject):
self._log_result(message, 'moving to alerts-sig')
message.move(self._folders['alerts-sig'])
return
# filter conference spam
if 'brocks+conf@onid.oregonstate.edu' in normalized_to:
self._log_result(message, 'deleting conference spam')
message.delete()
return
# filter backup-nightly
if 'backup-nightly@lists.oregonstate.edu' in normalized_to:
self._log_result(message, 'moving to backup-nightly')
message.move(self._folders['backup-nightly'])
return
# delete HP alert spam
if normalized_from == 'alerts@alerts.mail.hpe.com':
self._log_result(message, 'deleting HP alert spam')
message.delete()
return
# keep messages from allowed emails and domains
for good in ALLOW:
if good in normalized_from and normalized_from not in BLOCK_EMAIL:
self._log_result(message,
f"keeping message from allowed sender {good}")
return
# junk messages from blocked senders
if normalized_from in BLOCK_EMAIL:
self._log_result(message, 'junking spam from blocked sender')
message.move(self._folders['Junk Email'])
return
# junk messages with blocked keywords in message body
is_spam = False
message_body = message.body.lower()
for phrase in self._normalized['BLOCK_KEYWORDS']:
if phrase in normalized_subject:
is_spam = True
break
if phrase in message_body:
is_spam = True
break
if is_spam:
self._log_result(message, 'junking spam containing blocked keyword')
message.move(self._folders['Junk Email'])
return
# process message headers into a sane data structure
headers = []
for header in message.message_headers:
h = {}
h[header['name']] = header['value'].lower()
headers.append(h)
# junk messages from blocked domains
for domain in BLOCK_DOMAINS:
if domain in normalized_from:
is_spam = True
break
if search_headers(domain, headers):
is_spam = True
break
if is_spam:
self._log_result(message, 'junking spam from blocked domain')
message.move(self._folders['Junk Email'])
return
# junk known spam headers
if (get_header('X-Spam-Flag', headers) == 'yes'
or int(get_header('X-MS-Exchange-Organization-SCL', headers) or 0) >= 5):
self._log_result(message, 'junking spam with known header')
message.move(self._folders['Junk Email'])
return
# KEEP MESSAGE
self._log_result(message, 'keeping message, passed all filter checks')
def normalize_lists(self):
self._normalized['BLOCK_KEYWORDS'] = [x.lower() for x in BLOCK_KEYWORDS]
def get_header(header_key, headers):
""" return the value(s) for header_key, or False if the header is absent """
vals = []
for header in headers:
if header_key in header:
for val in header.values():
vals.append(val)
if len(vals) > 1:
return vals
elif len(vals) == 1:
return vals[0]
else:
return False
def search_headers(search, headers):
""" return True if 'search' appears in any header value """
for header in headers:
for val in header.values():
if search in val:
return True
return False

4
logs/.gitignore vendored Normal file

@@ -0,0 +1,4 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore

8
mail-filter.conf-dist Normal file

@@ -0,0 +1,8 @@
[main]
Filters =
EnableDebugging = yes
MailCheckInterval = 60

[logging]
LogDir =
Timezone = America/Los_Angeles
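
One plausible way to fill in this template (the paths below are assumptions for illustration only; they simply point at the filter-rules.py file and logs/ directory added in this commit, and the real mail-filter.conf is excluded by .gitignore):

[main]
Filters = ./filter-rules.py
EnableDebugging = no
MailCheckInterval = 60

[logging]
LogDir = ./logs
Timezone = America/Los_Angeles

Note that the Azure application credentials are not stored in this file; mail-filter.py reads APP_CLIENT_ID, APP_SECRET_KEY, and APP_TENANT_ID from environment variables.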

154
mail-filter.py Normal file

@@ -0,0 +1,154 @@
import configparser
import logging
import logging.handlers
import os
import pendulum
import signal
import time
from importlib.machinery import SourceFileLoader
from O365 import Account, FileSystemTokenBackend
SCRIPTPATH = os.path.dirname(os.path.abspath(__file__))
# parse config file
config = {}
configfile = configparser.ConfigParser()
configfile.read(SCRIPTPATH + '/mail-filter.conf')
config['FILTERS_FILE'] = configfile.get('main', 'Filters')
config['IS_DEBUG'] = configfile.getboolean('main', 'EnableDebugging')
config['CHECK_INTERVAL'] = int(configfile.get('main', 'MailCheckInterval'))
config['LOG_DIR'] = configfile.get('logging', 'LogDir')
config['TIMEZONE'] = configfile.get('logging', 'Timezone')
config['APP_CLIENT_ID'] = os.getenv('APP_CLIENT_ID')
config['APP_SECRET_KEY'] = os.getenv('APP_SECRET_KEY')
config['APP_TENANT_ID'] = os.getenv('APP_TENANT_ID')
# convert timestamp to local time
def local_time(record, datefmt=None):
return pendulum.from_timestamp(
record.created,
tz=pendulum.timezone(config['TIMEZONE'])
).strftime('%Y-%m-%d %H:%M:%S %z')
# set up logger
logger = logging.getLogger('o365mf')
if config['IS_DEBUG']:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
formatter = logging.Formatter(
'%(asctime)s %(module)s [%(levelname)s] %(message)s')
formatter.formatTime = local_time
log_filename = f"{config['LOG_DIR']}/mail-filter.log"
handler = logging.handlers.TimedRotatingFileHandler(
log_filename, when='midnight', backupCount=5)
handler.setFormatter(formatter)
logger.addHandler(handler)
class O365MailFilter(object):
_scopes = [
'basic',
'https://graph.microsoft.com/Mail.ReadWrite'
]
def __init__(self, config):
self._config = config
self._is_canceled = False
self._folders = {}
self._normalized = {}
# auth with O365
self._authenticate()
def _authenticate(self):
token_backend = FileSystemTokenBackend(token_path='.cache',
token_filename='token.txt')
self._account = Account(
(self._config['APP_CLIENT_ID'], self._config['APP_SECRET_KEY']),
tenant_id=self._config['APP_TENANT_ID'],
token_backend=token_backend
)
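# the first run, with no cached token, triggers O365's interactive console consent flow;
# the resulting token is saved to .cache/token.txt and reused on later runs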
if not self._account.is_authenticated:
self._account.authenticate(scopes=self._scopes)
logger.info('Authentication successful')
def _load_filters(self):
""" load filter code from a file on disk """
loader = SourceFileLoader('filters', self._config['FILTERS_FILE'])
module = loader.load_module()
module.normalize_lists(self)
# make 'filter_message()' implemented in the file available for use
# within this class as 'self._filter_message()'
self._filter_message = module.filter_message
def _load_folders(self):
""" retrieve folders for this mailbox and cache their ids """
self._folders = {}
mailbox = self._account.mailbox()
folders = mailbox.get_folders()
for folder in folders:
self._folders[folder.name] = folder.folder_id
def _repr_message(self, message):
""" returns a str representation of a message suitable for logging """
# to = ','.join([r.address for r in message.to])
return f"[FROM: {message.sender.address} SUBJ: {message.subject}]"
def _log_result(self, message, result):
logger.info(f"{self._repr_message(message)} RESULT: {result}")
def filter(self):
self._load_filters()
self._load_folders()
mailbox = self._account.mailbox()
inbox = mailbox.inbox_folder()
# set limit to max allowed by O365, which is 999 messages
# we have to explicitly set a limit value or the O365 library will not
# paginate results correctly
limit = self._account.protocol.max_top_value
query = inbox.new_query()
query = query.on_attribute('isRead').equals(False).select(
'to_recipients', 'from', 'subject', 'body',
'internet_message_headers'
)
messages = inbox.get_messages(query=query, limit=limit, batch=25)
for message in messages:
self._filter_message(self, message)
def run(self):
""" run filter as a loop """
while not self._is_canceled:
self.filter()
time.sleep(self._config['CHECK_INTERVAL'])
logger.info('Done.')
def exit(self):
self._is_canceled = True
logger.info('Initializing O365 mail filter...')
o365mf = O365MailFilter(config)
def exit(signum, frame):
""" signal handler for a clean exit """
logger.info(f"Caught signal {signum}, exiting...")
o365mf.exit()
if __name__ == '__main__':
# register signal handlers
signal.signal(signal.SIGTERM, exit)
signal.signal(signal.SIGHUP, exit)
signal.signal(signal.SIGINT, exit)
# run it
o365mf.run()

22
requirements.txt Normal file

@@ -0,0 +1,22 @@
beautifulsoup4==4.9.1
boto3==1.14.18
botocore==1.17.18
certifi==2020.6.20
chardet==3.0.4
docutils==0.15.2
idna==2.10
jmespath==0.10.0
O365==2.0.10
oauthlib==3.1.0
pendulum==2.1.0
python-dateutil==2.8.1
pytz==2020.1
pytzdata==2019.3
requests==2.24.0
requests-oauthlib==1.3.0
s3transfer==0.3.3
six==1.15.0
soupsieve==2.0.1
stringcase==1.2.0
tzlocal==2.1
urllib3==1.25.9