[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [support-tools/master] Add script to sort out spam from ham depending on ticket status
commit bbdf8b1868a21006fc8eaae72b35d16c5183f1d8
Author: Lunar <lunar@xxxxxxxxxxxxxx>
Date: Fri Oct 31 14:32:40 2014 +0100
Add script to sort out spam from ham depending on ticket status
---
train-spam-filters/train_spam_filters | 119 +++++++++++++++++++++++++++++++++
1 file changed, 119 insertions(+)
diff --git a/train-spam-filters/train_spam_filters b/train-spam-filters/train_spam_filters
new file mode 100755
index 0000000..c8f64dc
--- /dev/null
+++ b/train-spam-filters/train_spam_filters
@@ -0,0 +1,119 @@
+#!/usr/bin/python
+#
+# This program is free software. It comes without any warranty, to
+# the extent permitted by applicable law. You can redistribute it
+# and/or modify it under the terms of the Do What The Fuck You Want
+# To Public License, Version 2, as published by Sam Hocevar. See
+# http://sam.zoy.org/wtfpl/COPYING for more details.
+
+from __future__ import print_function
+
+import email.parser
+import psycopg2
+import os
+import os.path
+from datetime import datetime, timedelta
+
+# When true, log() prints its messages; when false, log() discards them.
+DEBUG = False
+
+# Maildir tree that is scanned; it lives under the invoking user's $HOME.
+MAILDIR_ROOT = os.path.join(os.environ['HOME'], 'Maildir')
+# Folders (relative to MAILDIR_ROOT) whose 'cur' directory receives the
+# messages identified as spam and ham respectively.
+SPAM_MAILDIR = '.spam.learn'
+HAM_MAILDIR = '.xham.learn'
+
+# Unidentified messages whose mtime is at least this many days old are
+# reported as "too old".
+KEEP_FOR_MAX_DAYS = 100
+
+# Connection string handed to psycopg2.connect() for the RT database.
+RT_CONNINFO = "host=drobovi.torproject.org sslmode=require user=rtreader dbname=rt"
+
+# Ham lookup: tickets in a queue whose name starts with 'help', with status
+# 'resolved', that have an attachment carrying the given Message-Id.
+# The query is executed with a parameter tuple, so the literal percent sign
+# of the LIKE pattern is written '%%' and '%s' is the Message-Id placeholder.
+SELECT_HAM_TICKET_QUERY = """
+ SELECT DISTINCT Tickets.Id
+ FROM Queues, Tickets, Transactions
+ LEFT OUTER JOIN Attachments ON Attachments.TransactionId = Transactions.Id
+ WHERE Queues.Name LIKE 'help%%'
+ AND Tickets.Queue = Queues.Id
+ AND Tickets.Status = 'resolved'
+ AND Transactions.ObjectId = Tickets.Id
+ AND Transactions.ObjectType = 'RT::Ticket'
+ AND Attachments.MessageId = %s;
+"""
+
+# Spam lookup: tickets in the 'spam' queue, with status 'rejected', that
+# have an attachment carrying the given Message-Id ('%s' placeholder).
+SELECT_SPAM_TICKET_QUERY = """
+ SELECT DISTINCT Tickets.Id
+ FROM Queues, Tickets, Transactions
+ LEFT OUTER JOIN Attachments ON Attachments.TransactionId = Transactions.Id
+ WHERE Queues.Name = 'spam'
+ AND Tickets.Queue = Queues.Id
+ AND Tickets.Status = 'rejected'
+ AND Transactions.ObjectId = Tickets.Id
+ AND Transactions.ObjectType = 'RT::Ticket'
+ AND Attachments.MessageId = %s;
+"""
+
+# Single parser instance reused by handle_message() for every message file.
+EMAIL_PARSER = email.parser.Parser()
+
+if DEBUG:
+ def log(msg):
+ print(msg)
+else:
+ def log(msg):
+ pass
+
+def is_ham(msg_id):
+ global con
+
+ cur = con.cursor()
+ try:
+ cur.execute(SELECT_HAM_TICKET_QUERY, (msg_id,))
+ return cur.fetchone() is not None
+ finally:
+ cur.close()
+
+def is_spam(msg_id):
+ global con
+
+ cur = con.cursor()
+ try:
+ cur.execute(SELECT_SPAM_TICKET_QUERY, (msg_id,))
+ return cur.fetchone() is not None
+ finally:
+ cur.close()
+
+def handle_message(path):
+ msg = EMAIL_PARSER.parse(open(path), headersonly=True)
+ msg_id = msg['Message-Id']
+ if not msg_id.startswith('<') or not msg_id.endswith('>'):
+ log("%s: bad Message-Id, removing." % path)
+ print("os.unlink(" + path)
+ return
+ msg_id = msg_id[1:-1]
+ if is_ham(msg_id):
+ os.rename(path, os.path.join(MAILDIR_ROOT, HAM_MAILDIR, 'cur', os.path.basename(path)))
+ log("%s: ham, moving." % path)
+ return
+ if is_spam(msg_id):
+ os.rename(path, os.path.join(MAILDIR_ROOT, SPAM_MAILDIR, 'cur', os.path.basename(path)))
+ log("%s: spam, moving." % path)
+ return
+ mtime = datetime.fromtimestamp(os.stat(path).st_mtime)
+ limit = datetime.now() - timedelta(days=KEEP_FOR_MAX_DAYS)
+ if mtime <= limit:
+ log("%s: too old, removing." % path)
+ print("os.unlink(" + path)
+ return
+ # well, it's not identified ham, not identified spam, and not too old
+ # let's keep the message for now
+ log("%s: unknown, keeping." % path)
+
+def scan_directory(dir_path):
+ for filename in os.listdir(dir_path):
+ path = os.path.join(dir_path, filename)
+ handle_message(path)
+
+con = None
+
+if __name__ == '__main__':
+ con = psycopg2.connect(RT_CONNINFO)
+ for filename in os.listdir(MAILDIR_ROOT):
+ if filename.startswith('.help'):
+ for subdir in ['new', 'cur', 'tmp']:
+ scan_directory(os.path.join(MAILDIR_ROOT, filename, subdir))
+ con.close()
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits