From 11f692cc42975a03c56111647e0df233fd89140d Mon Sep 17 00:00:00 2001
From: Stacy Brock
Date: Tue, 30 Mar 2021 09:46:40 -0700
Subject: [PATCH] Initial commit

---
Review notes (text between '---' and the diffstat is ignored by git am):
- filter-rules.py: added the missing comma after 'register to vote'; the
  backup-nightly rule now uses the lower-cased recipient list; the
  blocked-domain rule no longer references an undefined name, passes the
  headers to search_headers() and actually moves the message to Junk; the
  X-Spam-Flag / SCL check compares parsed header values instead of passing
  a boolean to get_header(); search_headers() lost its debug print and
  always returns a bool.
- mail-filter.py: token cache path is anchored to the script directory;
  failed authentication raises instead of logging success; the filter
  module is loaded without the deprecated Loader.load_module(); run()
  survives a failed pass and reacts to exit() within about a second.
- The 'index' lines of these two files still carry the blob ids of the
  content before the fixes; they are informational for new files.

 .cache/.gitignore     |   4 +
 .gitignore            |   2 +
 README.md             |   3 +
 filter-rules.py       | 196 ++++++++++++++++++++++++++++++++++++++++++
 logs/.gitignore       |   4 +
 mail-filter.conf-dist |   8 ++
 mail-filter.py        | 181 ++++++++++++++++++++++++++++++++++++++
 requirements.txt      |  22 +++++
 8 files changed, 420 insertions(+)
 create mode 100644 .cache/.gitignore
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 filter-rules.py
 create mode 100644 logs/.gitignore
 create mode 100644 mail-filter.conf-dist
 create mode 100644 mail-filter.py
 create mode 100644 requirements.txt

diff --git a/.cache/.gitignore b/.cache/.gitignore
new file mode 100644
index 0000000..5e7d273
--- /dev/null
+++ b/.cache/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..33a579b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+mail-filter.conf
+__pycache__
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..40f7b20
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# o365-mail-filter
+
+... because spam takes many forms.
diff --git a/filter-rules.py b/filter-rules.py
new file mode 100644
index 0000000..290cf98
--- /dev/null
+++ b/filter-rules.py
@@ -0,0 +1,196 @@
+BLOCK_EMAIL = [
+    'chirhart@amazon.com',
+    'ron.krogel@citrix.com',
+    'replieswelcome@duo.com',
+    'webinars@duo.com',
+    'diversity@oregonstate.edu',
+    'jramiro@pagerduty.com',
+    'info@snowflake.com',
+    'lisa@duo.com',
+    'cortana@microsoft.com'
+]
+
+BLOCK_KEYWORDS = [
+    'charitable fund drive',
+    'election reminder',
+    'email preferences',
+    'end these email updates',
+    'food drive',
+    'food share',
+    'general election',
+    'giving tuesday',
+    'hardship leave donations needed',
+    'manage your preferences',
+    'modify your preferences',
+    'opt-out',
+    'opt out',
+    'prefer not to receive',
+    'prefer not to see',
+    'register to vote',
+    'Samsung SDS America',
+    'sidekickopen',
+    'special election',
+    'subscription preferences',
+    'survey',
+    'unsubscribe',
+    'voter registration',
+    'want to receive',
+    'webinar',
+    'whitepaper',
+    'wish to receive',
+]
+
+BLOCK_DOMAINS = [
+]
+
+ALLOW = [
+    'oregonstate.edu',
+    'github.com',
+    'duo.com',
+    'sns.amazonaws.com',
+    'opsgenie.net',
+    'notify@teamdynamixapp.com',
+    'newsbites@email.sans.org',
+    'noreply@box.com',
+    'noreply@email.teams.microsoft.com',
+    'no-reply@sharepointonline.com',
+    'slalom.com',
+    'govdelivery.com',
+    'linkoregon.org',
+    'megan@pdxwit.org',
+    'busyconf.com',
+    'support@githubsupport.com',
+    'microsoft.com'
+]
+
+def filter_message(self, message):
+    """ apply the filter rules to a single unread message
+
+    called by O365MailFilter with itself passed explicitly as 'self'; the
+    first rule that matches logs its result, moves or deletes the message
+    where the rule calls for it, and returns
+    """
+    # normalize message attributes
+    normalized_to = [x.address.lower() for x in message.to]
+    normalized_from = message.sender.address.lower()
+    subject = message.subject or ''
+    normalized_subject = subject.lower()
+
+    # filter alerts-sig
+    if (normalized_from in ['mcc-b11-stor1@oregonstate.edu',
+                            'mcc-b12-stor1@oregonstate.edu',
+                            'ousclus@oregonstate.edu',
+                            'isilon@storage.sig.oregonstate.edu']
+            or 'alarm.DatastoreDiskUsageAlarm' in subject):
+        self._log_result(message, 'moving to alerts-sig')
+        message.move(self._folders['alerts-sig'])
+        return
+
+    # filter conference spam
+    if 'brocks+conf@onid.oregonstate.edu' in normalized_to:
+        self._log_result(message, 'deleting conference spam')
+        message.delete()
+        return
+
+    # filter backup-nightly; use the lower-cased recipient list like the other
+    # recipient rule so the match does not depend on address case
+    if 'backup-nightly@lists.oregonstate.edu' in normalized_to:
+        self._log_result(message, 'moving to backup-nightly')
+        message.move(self._folders['backup-nightly'])
+        return
+
+    # delete HP alert spam
+    if normalized_from == 'alerts@alerts.mail.hpe.com':
+        self._log_result(message, 'deleting HP alert spam')
+        message.delete()
+        return
+
+    # keep messages from allowed emails and domains
+    for good in ALLOW:
+        if good in normalized_from and normalized_from not in BLOCK_EMAIL:
+            self._log_result(message,
+                             f"keeping message from allowed sender {good}")
+            return
+
+    # junk messages from blocked senders
+    if normalized_from in BLOCK_EMAIL:
+        self._log_result(message, 'junking spam from blocked sender')
+        message.move(self._folders['Junk Email'])
+        return
+
+    # junk messages with blocked keywords in message subject or body
+    message_body = (message.body or '').lower()
+    for phrase in self._normalized['BLOCK_KEYWORDS']:
+        if phrase in normalized_subject or phrase in message_body:
+            self._log_result(message,
+                             'junking spam containing blocked keyword')
+            message.move(self._folders['Junk Email'])
+            return
+
+    # process message headers into a sane data structure
+    headers = []
+    for header in message.message_headers or []:
+        headers.append({header['name']: header['value'].lower()})
+
+    # junk messages from blocked domains
+    for domain in BLOCK_DOMAINS:
+        domain = domain.lower()
+        if domain in normalized_from or search_headers(domain, headers):
+            self._log_result(message, 'junking spam from blocked domain')
+            message.move(self._folders['Junk Email'])
+            return
+
+    # junk known spam headers (header values were lower-cased above)
+    if ('yes' in get_header_values('X-Spam-Flag', headers)
+            or spam_confidence_level(headers) >= 5):
+        self._log_result(message, 'junking spam with known header')
+        message.move(self._folders['Junk Email'])
+        return
+
+    # KEEP MESSAGE
+    self._log_result(message, 'keeping message, passed all filter checks')
+
+def normalize_lists(self):
+    """ cache a lower-cased copy of BLOCK_KEYWORDS on the filter object """
+    self._normalized['BLOCK_KEYWORDS'] = [x.lower() for x in BLOCK_KEYWORDS]
+
+def get_header_values(header_key, headers):
+    """ return every value of the named header as a list (may be empty)
+
+    header names are compared case-insensitively
+    """
+    wanted = header_key.lower()
+    return [val
+            for header in headers
+            for name, val in header.items()
+            if name.lower() == wanted]
+
+def get_header(header_key, headers):
+    """ return the value(s) of the named header
+
+    returns a list when the header occurs more than once, the single value
+    when it occurs once, and False when it is absent
+    """
+    vals = get_header_values(header_key, headers)
+    if len(vals) > 1:
+        return vals
+    if vals:
+        return vals[0]
+    return False
+
+def spam_confidence_level(headers):
+    """ return the highest X-MS-Exchange-Organization-SCL value, 0 if none """
+    levels = []
+    for val in get_header_values('X-MS-Exchange-Organization-SCL', headers):
+        try:
+            levels.append(int(val))
+        except ValueError:
+            # skip a malformed value instead of aborting the whole filter run
+            continue
+    return max(levels, default=0)
+
+def search_headers(search, headers):
+    """ return True if 'search' occurs in any header value, else False """
+    return any(search in val
+               for header in headers
+               for val in header.values())
diff --git a/logs/.gitignore b/logs/.gitignore
new file mode 100644
index 0000000..5e7d273
--- /dev/null
+++ b/logs/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/mail-filter.conf-dist b/mail-filter.conf-dist
new file mode 100644
index 0000000..c711fba
--- /dev/null
+++ b/mail-filter.conf-dist
@@ -0,0 +1,8 @@
+[main]
+Filters =
+EnableDebugging = yes
+MailCheckInterval = 60
+
+[logging]
+LogDir =
+Timezone = America/Los_Angeles
diff --git a/mail-filter.py b/mail-filter.py
new file mode 100644
index 0000000..7d415ef
--- /dev/null
+++ b/mail-filter.py
@@ -0,0 +1,181 @@
+import configparser
+import importlib.util
+import logging
+import logging.handlers
+import os
+import pendulum
+import signal
+import time
+from importlib.machinery import SourceFileLoader
+from O365 import Account, FileSystemTokenBackend
+
+SCRIPTPATH = os.path.dirname(os.path.abspath(__file__))
+
+# parse config file
+config = {}
+configfile = configparser.ConfigParser()
+configfile.read(SCRIPTPATH + '/mail-filter.conf')
+config['FILTERS_FILE'] = configfile.get('main', 'Filters')
+config['IS_DEBUG'] = configfile.getboolean('main', 'EnableDebugging')
+config['CHECK_INTERVAL'] = int(configfile.get('main', 'MailCheckInterval'))
+config['LOG_DIR'] = configfile.get('logging', 'LogDir')
+config['TIMEZONE'] = configfile.get('logging', 'Timezone')
+config['APP_CLIENT_ID'] = os.getenv('APP_CLIENT_ID')
+config['APP_SECRET_KEY'] = os.getenv('APP_SECRET_KEY')
+config['APP_TENANT_ID'] = os.getenv('APP_TENANT_ID')
+
+# convert timestamp to local time
+def local_time(record, datefmt=None):
+    return pendulum.from_timestamp(
+        record.created,
+        tz=pendulum.timezone(config['TIMEZONE'])
+    ).strftime('%Y-%m-%d %H:%M:%S %z')
+
+# set up logger
+logger = logging.getLogger('o365mf')
+if config['IS_DEBUG']:
+    logger.setLevel(logging.DEBUG)
+else:
+    logger.setLevel(logging.INFO)
+formatter = logging.Formatter(
+    '%(asctime)s %(module)s [%(levelname)s] %(message)s')
+formatter.formatTime = local_time
+log_filename = f"{config['LOG_DIR']}/mail-filter.log"
+handler = logging.handlers.TimedRotatingFileHandler(
+    log_filename, when='midnight', backupCount=5)
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+
+class O365MailFilter(object):
+    """ polls the O365 inbox and runs the configured filter rules """
+
+    _scopes = [
+        'basic',
+        'https://graph.microsoft.com/Mail.ReadWrite'
+    ]
+
+    def __init__(self, config):
+        self._config = config
+        self._is_canceled = False
+        self._folders = {}
+        self._normalized = {}
+
+        # auth with O365
+        self._authenticate()
+
+    def _authenticate(self):
+        """ create the O365 account object, authenticating if needed """
+        # anchor the token cache to the script directory so the cached token
+        # is found regardless of the current working directory
+        token_backend = FileSystemTokenBackend(
+            token_path=os.path.join(SCRIPTPATH, '.cache'),
+            token_filename='token.txt')
+
+        self._account = Account(
+            (self._config['APP_CLIENT_ID'], self._config['APP_SECRET_KEY']),
+            tenant_id=self._config['APP_TENANT_ID'],
+            token_backend=token_backend
+        )
+
+        if not self._account.is_authenticated:
+            if not self._account.authenticate(scopes=self._scopes):
+                raise RuntimeError('O365 authentication failed')
+
+        logger.info('Authentication successful')
+
+    def _load_filters(self):
+        """ load filter code from a file on disk """
+        # Loader.load_module() is deprecated; create the module from a spec
+        # and execute it, which re-reads the rules from disk on every call
+        loader = SourceFileLoader('filters', self._config['FILTERS_FILE'])
+        spec = importlib.util.spec_from_loader('filters', loader)
+        module = importlib.util.module_from_spec(spec)
+        loader.exec_module(module)
+        module.normalize_lists(self)
+        # make 'filter_message()' implemented in the file available for use
+        # within this class as 'self._filter_message()'
+        self._filter_message = module.filter_message
+
+    def _load_folders(self):
+        """ retrieve folders for this mailbox and cache their ids """
+        self._folders = {}
+
+        mailbox = self._account.mailbox()
+        folders = mailbox.get_folders()
+
+        for folder in folders:
+            self._folders[folder.name] = folder.folder_id
+
+    def _repr_message(self, message):
+        """ returns a str representation of a message suitable for logging """
+        # to = ','.join([r.address for r in message.to])
+        return f"[FROM: {message.sender.address} SUBJ: {message.subject}]"
+
+    def _log_result(self, message, result):
+        """ log the action a filter rule took for a message """
+        logger.info(f"{self._repr_message(message)} RESULT: {result}")
+
+    def filter(self):
+        """ run the filter rules once over every unread inbox message """
+        self._load_filters()
+        self._load_folders()
+
+        mailbox = self._account.mailbox()
+        inbox = mailbox.inbox_folder()
+
+        # set limit to max allowed by O365, which is 999 messages
+        # we have to explicitly set a limit value or the O365 library will not
+        # paginate results correctly
+        limit = self._account.protocol.max_top_value
+        query = inbox.new_query()
+        query = query.on_attribute('isRead').equals(False).select(
+            'to_recipients', 'from', 'subject', 'body',
+            'internet_message_headers'
+        )
+        messages = inbox.get_messages(query=query, limit=limit, batch=25)
+
+        for message in messages:
+            self._filter_message(self, message)
+
+    def run(self):
+        """ run filter as a loop """
+        while not self._is_canceled:
+            try:
+                self.filter()
+            except Exception:
+                # a failed pass (e.g. a network error) must not kill the
+                # daemon; log the traceback and retry after the interval
+                logger.exception('Mail filter pass failed')
+            self._wait(self._config['CHECK_INTERVAL'])
+
+        logger.info('Done.')
+
+    def _wait(self, seconds):
+        """ sleep in one-second slices so exit() takes effect promptly """
+        for _ in range(seconds):
+            if self._is_canceled:
+                break
+            time.sleep(1)
+
+    def exit(self):
+        """ ask the run() loop to stop """
+        self._is_canceled = True
+
+
+logger.info('Initializing O365 mail filter...')
+o365mf = O365MailFilter(config)
+
+def exit(signum, frame):
+    """ signal handler for a clean exit """
+    logger.info(f"Caught signal {signum}, exiting...")
+    o365mf.exit()
+
+if __name__ == '__main__':
+    # register signal handlers
+    signal.signal(signal.SIGTERM, exit)
+    signal.signal(signal.SIGHUP, exit)
+    signal.signal(signal.SIGINT, exit)
+
+    # run it
+    o365mf.run()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9a19449
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,22 @@
+beautifulsoup4==4.9.1
+boto3==1.14.18
+botocore==1.17.18
+certifi==2020.6.20
+chardet==3.0.4
+docutils==0.15.2
+idna==2.10
+jmespath==0.10.0
+O365==2.0.10
+oauthlib==3.1.0
+pendulum==2.1.0
+python-dateutil==2.8.1
+pytz==2020.1
+pytzdata==2019.3
+requests==2.24.0
+requests-oauthlib==1.3.0
+s3transfer==0.3.3
+six==1.15.0
+soupsieve==2.0.1
+stringcase==1.2.0
+tzlocal==2.1
+urllib3==1.25.9