[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[bridgedb/develop] Move email address parsers from bridgedb.Dist → bridgedb.parse.addr.
commit dfe81deffb272e4585af141ad58ece7d68e352f8
Author: Isis Lovecruft <isis@xxxxxxxxxxxxxx>
Date: Tue Apr 8 21:37:11 2014 +0000
Move email address parsers from bridgedb.Dist → bridgedb.parse.addr.
---
lib/bridgedb/Dist.py | 117 +++-------------------
lib/bridgedb/EmailServer.py | 41 ++++----
lib/bridgedb/Tests.py | 9 +-
lib/bridgedb/parse/addr.py | 171 +++++++++++++++++++++++++++++++--
lib/bridgedb/test/test_EmailServer.py | 2 +-
5 files changed, 208 insertions(+), 132 deletions(-)
diff --git a/lib/bridgedb/Dist.py b/lib/bridgedb/Dist.py
index 5b5a602..4e0a1aa 100644
--- a/lib/bridgedb/Dist.py
+++ b/lib/bridgedb/Dist.py
@@ -28,9 +28,20 @@ from bridgedb.Filters import filterAssignBridgesToRing
from bridgedb.Filters import filterBridgesByRules
from bridgedb.Filters import filterBridgesByIP4
from bridgedb.Filters import filterBridgesByIP6
+from bridgedb.parse import addr
+from bridgedb.parse.addr import UnsupportedDomain
from bridgedb.safelog import logSafely
+MAX_EMAIL_RATE = 3*3600
+
+class IgnoreEmail(addr.BadEmail):
+ """Raised when we get requests from this address after rate warning."""
+
+class TooSoonEmail(addr.BadEmail):
+ """Raised when we got a request from this address too recently."""
+
+
def uniformMap(ip):
"""Map an IP to an arbitrary 'area' string, such that any two /24 addresses
get the same string.
@@ -322,103 +333,6 @@ class IPBasedDistributor(Distributor):
def dumpAssignments(self, f, description=""):
self.splitter.dumpAssignments(f, description)
-
-# These characters are the ones that RFC2822 allows.
-#ASPECIAL = '!#$%&*+-/=?^_`{|}~'
-#ASPECIAL += "\\\'"
-# These are the ones we're pretty sure we can handle right.
-ASPECIAL = '-_+/=_~'
-
-ACHAR = r'[\w%s]' % "".join("\\%s"%c for c in ASPECIAL)
-DOTATOM = r'%s+(?:\.%s+)*' % (ACHAR,ACHAR)
-DOMAIN = r'\w+(?:\.\w+)*'
-ADDRSPEC = r'(%s)\@(%s)' % (DOTATOM, DOMAIN)
-
-SPACE_PAT = re.compile(r'\s+')
-ADDRSPEC_PAT = re.compile(ADDRSPEC)
-
-MAX_EMAIL_RATE = 3*3600
-
-class BadEmail(Exception):
- """Exception raised when we get a bad email address."""
- def __init__(self, msg, email):
- Exception.__init__(self, msg)
- self.email = email
-
-class UnsupportedDomain(BadEmail):
- """Exception raised when we get an email address from a domain we
- don't know."""
-
-class TooSoonEmail(BadEmail):
- """Raised when we got a request from this address too recently."""
-
-class IgnoreEmail(BadEmail):
- """Raised when we get requests from this address after rate warning."""
-
-def extractAddrSpec(addr):
- """Given an email From line, try to extract and parse the addrspec
- portion. Returns localpart,domain on success; raises BadEmail
- on failure.
- """
- orig_addr = addr
- addr = SPACE_PAT.sub(' ', addr)
- addr = addr.strip()
- # Only works on usual-form addresses; raises BadEmail on weird
- # address form. That's okay, since we'll only get those when
- # people are trying to fool us.
- if '<' in addr:
- # Take the _last_ index of <, so that we don't need to bother
- # with quoting tricks.
- idx = addr.rindex('<')
- addr = addr[idx:]
- m = re.search(r'<([^>]*)>', addr)
- if m is None:
- raise BadEmail("Couldn't extract address spec", orig_addr)
- addr = m.group(1)
-
- # At this point, addr holds a putative addr-spec. We only allow the
- # following form:
- # addr-spec = local-part "@" domain
- # local-part = dot-atom
- # domain = dot-atom
- #
- # In particular, we are disallowing: obs-local-part, obs-domain,
- # comment, obs-FWS,
- #
- # Other forms exist, but none of the incoming services we recognize
- # support them.
- addr = addr.replace(" ", "")
- m = ADDRSPEC_PAT.match(addr)
- if not m:
- raise BadEmail("Bad address spec format", orig_addr)
- localpart, domain = m.groups()
- return localpart, domain
-
-def normalizeEmail(addr, domainmap, domainrules):
- """Given the contents of a from line, and a map of supported email
- domains (in lowercase), raise BadEmail or return a normalized
- email address.
- """
- addr = addr.lower()
- localpart, domain = extractAddrSpec(addr)
- if domainmap is not None:
- domain = domainmap.get(domain, None)
- if domain is None:
- raise UnsupportedDomain("Domain not supported", addr)
-
- #XXXX Do these rules also hold for Yahoo?
-
- # addr+foo@ is an alias for addr@
- idx = localpart.find('+')
- if idx >= 0:
- localpart = localpart[:idx]
- rules = domainrules.get(domain, [])
- if 'ignore_dots' in rules:
- # j.doe@ is the same as jdoe@.
- localpart = localpart.replace(".", "")
-
- return "%s@%s"%(localpart, domain)
-
class EmailBasedDistributor(Distributor):
"""Object that hands out bridges based on the email address of an incoming
request and the current time period.
@@ -475,12 +389,13 @@ class EmailBasedDistributor(Distributor):
bridgeFilterRules=[]
now = time.time()
try:
- emailaddress = normalizeEmail(emailaddress, self.domainmap,
- self.domainrules)
- except BadEmail as err:
+ emailaddress = addr.normalizeEmail(emailaddress, self.domainmap,
+ self.domainrules)
+ except addr.BadEmail as err:
logging.warn(err)
return []
- if emailaddress is None:
+
+ if not emailaddress:
return [] #XXXX raise an exception.
with bridgedb.Storage.getDB() as db:
diff --git a/lib/bridgedb/EmailServer.py b/lib/bridgedb/EmailServer.py
index f9f43bb..e1aa57e 100644
--- a/lib/bridgedb/EmailServer.py
+++ b/lib/bridgedb/EmailServer.py
@@ -26,7 +26,6 @@ from twisted.mail import smtp
from zope.interface import implements
-from bridgedb.Dist import BadEmail, TooSoonEmail, IgnoreEmail
from bridgedb import Dist
from bridgedb import I18n
from bridgedb import safelog
@@ -34,6 +33,10 @@ from bridgedb.Filters import filterBridgesByIP6
from bridgedb.Filters import filterBridgesByIP4
from bridgedb.Filters import filterBridgesByTransport
from bridgedb.Filters import filterBridgesByNotBlockedIn
+from bridgedb.parse import addr
+from bridgedb.parse.addr import BadEmail
+from bridgedb.parse.addr import UnsupportedDomain
+from bridgedb.parse.addr import canonicalizeEmailDomain
class MailFile:
@@ -97,23 +100,24 @@ def getMailResponse(lines, ctx):
lang = getLocaleFromPlusAddr(clientToaddr)
t = I18n.getLang(lang)
+ canon = ctx.cfg.EMAIL_DOMAIN_MAP
+ for domain, rule in ctx.cfg.EMAIL_DOMAIN_RULES.items():
+ if domain not in canon.keys():
+ canon[domain] = domain
+ for domain in ctx.cfg.EMAIL_DOMAINS:
+ canon[domain] = domain
+
try:
- _, addrdomain = Dist.extractAddrSpec(clientAddr.lower())
- except BadEmail:
- logging.info("Ignoring bad address on incoming email.")
+ _, clientDomain = addr.extractEmailAddress(clientAddr.lower())
+ canonical = canonicalizeEmailDomain(clientDomain, canon)
+ except UnsupportedDomain as error:
+ logging.warn(error)
return None, None
-
- if not addrdomain:
- logging.info("Couldn't parse domain from %r" % clientAddr)
-
- if addrdomain and ctx.cfg.EMAIL_DOMAIN_MAP:
- addrdomain = ctx.cfg.EMAIL_DOMAIN_MAP.get(addrdomain, addrdomain)
-
- if addrdomain not in ctx.cfg.EMAIL_DOMAINS:
- logging.warn("Unrecognized email domain %r", addrdomain)
+ except BadEmail as error:
+ logging.warn(error)
return None, None
- rules = ctx.cfg.EMAIL_DOMAIN_RULES.get(addrdomain, [])
+ rules = ctx.cfg.EMAIL_DOMAIN_RULES.get(canonical, [])
if 'dkim' in rules:
# getheader() returns the last of a given kind of header; we want
@@ -123,8 +127,8 @@ def getMailResponse(lines, ctx):
if dkimHeaders:
dkimHeader = dkimHeaders[0]
if not dkimHeader.startswith("pass"):
- logging.info("Got a bad dkim header (%r) on an incoming mail; "
- "rejecting it.", dkimHeader)
+ logging.info("Rejecting bad DKIM header on incoming email: %r "
+ % dkimHeader)
return None, None
# Was the magic string included
@@ -186,17 +190,16 @@ def getMailResponse(lines, ctx):
bridgeFilterRules=bridgeFilterRules)
# Handle rate limited email
- except TooSoonEmail as err:
+ except Dist.TooSoonEmail as err:
logging.info("Got a mail too frequently; warning '%s': %s."
% (clientAddr, err))
- # Compose a warning email
# MAX_EMAIL_RATE is in seconds, convert to hours
body = buildSpamWarningTemplate(t) % (Dist.MAX_EMAIL_RATE / 3600)
return composeEmail(ctx.fromAddr, clientAddr, subject, body, msgID,
gpgContext=ctx.gpgContext)
- except IgnoreEmail as err:
+ except Dist.IgnoreEmail as err:
logging.info("Got a mail too frequently; ignoring '%s': %s."
% (clientAddr, err))
return None, None
diff --git a/lib/bridgedb/Tests.py b/lib/bridgedb/Tests.py
index 72dfe5e..4147549 100644
--- a/lib/bridgedb/Tests.py
+++ b/lib/bridgedb/Tests.py
@@ -232,10 +232,11 @@ class EmailBridgeDistTests(unittest.TestCase):
def testUnsupportedDomain(self):
db = self.db
- self.assertRaises(bridgedb.Dist.UnsupportedDomain,
- bridgedb.Dist.normalizeEmail, 'bad@xxxxxxxxx',
- {'example.com':'example.com'},
- {'example.com':[]})
+ self.assertRaises(bridgedb.parse.addr.UnsupportedDomain,
+ bridgedb.parse.addr.normalizeEmail,
+ 'bad@xxxxxxxxx',
+ {'example.com':'example.com'},
+ {'example.com':[]})
class IPBridgeDistTests(unittest.TestCase):
def dumbAreaMapper(self, ip):
diff --git a/lib/bridgedb/parse/addr.py b/lib/bridgedb/parse/addr.py
index 455b953..f34f416 100644
--- a/lib/bridgedb/parse/addr.py
+++ b/lib/bridgedb/parse/addr.py
@@ -13,19 +13,22 @@
** Module Overview: **
-..
+::
parse
||_ parse.addr
- | |_ isIPAddress - Check if an arbitrary string is an IP address.
- | |_ isIPv4 - Check if an arbitrary string is an IPv4 address.
- | |_ isIPv6 - Check if an arbitrary string is an IPv6 address.
- | \_ isValidIP - Check that an IP address is valid.
+ | | |_ extractEmailAddress - Validate a :rfc:2822 email address.
+ | | |_ isIPAddress - Check if an arbitrary string is an IP address.
+ | | |_ isIPv4 - Check if an arbitrary string is an IPv4 address.
+ | | |_ isIPv6 - Check if an arbitrary string is an IPv6 address.
+ | | \_ isValidIP - Check that an IP address is valid.
+ | |
+ | |_ :class:`PortList` - A container class for validated port ranges.
|
- |__ :mod:`bridgedbparse.headers`
+ |__ :mod:`bridgedb.parse.headers`
|__ :mod:`bridgedb.parse.options`
\__ :mod:`bridgedb.parse.versions`
-..
+::
Private IP Address Ranges:
''''''''''''''''''''''''''
@@ -147,12 +150,119 @@ from __future__ import print_function
from __future__ import unicode_literals
import logging
+import re
+
import ipaddr
+#: These are the special characters which RFC2822 allows within email addresses:
+#ASPECIAL = '!#$%&*+-/=?^_`{|}~' + "\\\'"
+#: These are the ones we're pretty sure we can handle right:
+ASPECIAL = '-_+/=_~'
+ACHAR = r'[\w%s]' % "".join("\\%s" % c for c in ASPECIAL)
+DOTATOM = r'%s+(?:\.%s+)*' % (ACHAR, ACHAR)
+DOMAIN = r'\w+(?:\.\w+)*'
+ADDRSPEC = r'(%s)\@(%s)' % (DOTATOM, DOMAIN)
+SPACE_PAT = re.compile(r'\s+')
+#: A compiled regex with matches RFC2822 email address strings:
+ADDRSPEC_PAT = re.compile(ADDRSPEC)
+
+
+class BadEmail(Exception):
+ """Exception raised when we get a bad email address."""
+ def __init__(self, msg, email):
+ Exception.__init__(self, msg)
+ self.email = email
+
class InvalidPort(ValueError):
"""Raised when a given port number is invalid."""
+class UnsupportedDomain(ValueError):
+ """Raised when we get an email address from an unsupported domain."""
+
+
+def canonicalizeEmailDomain(domain, domainmap):
+ """Decide if an email was sent from a permitted domain.
+
+ :param str domain: The domain portion of an email address to validate. It
+ will be checked that it is one of the domains allowed to email
+ requests for bridges to the
+ :class:`~bridgedb.Dist.EmailBasedDistributor`.
+ :param dict domainmap: A map of permitted alternate domains (in lowercase)
+ to their canonical domain names (in lowercase). This can be configured
+ with the ``EMAIL_DOMAIN_MAP`` option in ``bridgedb.conf``, for
+ example::
+ EMAIL_DOMAIN_MAP = {'mail.google.com': 'gmail.com',
+ 'googlemail.com': 'gmail.com'}
+ :raises UnsupportedDomain: if the domain portion of the email address is
+ not within the map of alternate to canonical allowed domain names.
+ :rtype: str
+ :returns: The canonical domain name for the email address.
+ """
+ permitted = None
+
+ try:
+ permitted = domainmap.get(domain)
+ except AttributeError:
+ logging.debug("Got non-dict for 'domainmap' parameter: %r" % domainmap)
+
+ if not permitted:
+ raise UnsupportedDomain("Domain not permitted: %s" % domain)
+
+ return permitted
+
+def extractEmailAddress(emailaddr):
+ """Given an email address, obtained, for example, via a ``From:`` or
+ ``Sender:`` email header, try to extract and parse (according to
+ :rfc:2822) the username and domain portions. Returns ``(username,
+ domain)`` on success; raises BadEmail on failure.
+
+ We only allow the following form::
+ ADDRSPEC := LOCAL_PART "@" DOMAIN
+ LOCAL_PART := DOTATOM
+ DOMAIN := DOTATOM
+
+ In particular, we are disallowing: obs-local-part, obs-domain, comment,
+ and obs-FWS. Other forms exist, but none of the incoming services we
+ recognize support them.
+
+ :param emailaddr: An email address to validate.
+ :raises BadEmail: if the **emailaddr** couldn't be validated or parsed.
+ :rtype: tuple
+ :returns: A tuple of the validated email address, containing the mail
+ username and the domain::
+ (LOCALPART, DOMAIN)
+ """
+ orig = emailaddr
+
+ try:
+ addr = SPACE_PAT.sub(' ', emailaddr).strip()
+ except TypeError as error:
+ logging.debug(error)
+ raise BadEmail("Can't extract address from object type %r!"
+ % type(orig), orig)
+
+ # Only works on usual-form addresses; raises BadEmail on weird
+ # address form. That's okay, since we'll only get those when
+ # people are trying to fool us.
+ if '<' in addr:
+ # Take the _last_ index of <, so that we don't need to bother
+ # with quoting tricks.
+ idx = addr.rindex('<')
+ addr = addr[idx:]
+ m = re.search(r'<([^>]*)>', addr)
+ if m is None:
+ raise BadEmail("Couldn't extract address spec", orig)
+ addr = m.group(1)
+
+ # At this point, addr holds a putative addr-spec.
+ addr = addr.replace(" ", "")
+ m = ADDRSPEC_PAT.match(addr)
+ if not m:
+ raise BadEmail("Bad address spec format", orig)
+
+ localpart, domain = m.groups()
+ return localpart, domain
def isIPAddress(ip, compressed=True):
"""Check if an arbitrary string is an IP address, and that it's valid.
@@ -275,6 +385,53 @@ def isValidIP(ip):
return False
return True
+def normalizeEmail(emailaddr, domainmap, domainrules, ignorePlus=True):
+ """Normalise an email address according to the processing rules for its
+ canonical originating domain.
+
+ The email address, **emailaddr**, will be parsed and validated, and then
+ checked that it originated from one of the domains allowed to email
+ requests for bridges to the :class:`~bridgedb.Dist.EmailBasedDistributor`
+ via the :func:`canonicaliseEmailDomain` function.
+
+ :param str emailaddr: An email address to normalise.
+ :param dict domainmap: A map of permitted alternate domains (in lowercase)
+ to their canonical domain names (in lowercase). This can be configured
+ with the ``EMAIL_DOMAIN_MAP`` option in ``bridgedb.conf``, for
+ example::
+ EMAIL_DOMAIN_MAP = {'mail.google.com': 'gmail.com',
+ 'googlemail.com': 'gmail.com'}
+ :param dict domainrules: A mapping of canonical permitted domain names to
+ a list of rules which should be applied to processing them, for
+ example::
+ EMAIL_DOMAIN_RULES = {'gmail.com': ["ignore_dots", "dkim"]
+ Currently, ``"ignore_dots"`` means that all ``"."`` characters will be
+ removed from the local part of the validated email address.
+ :param bool ignorePlus: If ``True``, assume that
+ ``blackhole+kerr@xxxxxxxxxxxxxx`` is an alias for
+ ``blackhole@xxxxxxxxxxxxxx``, and remove everything after the first
+ ``'+'`` character.
+ :raises BadEmail: if the email address could not be parsed or validated.
+ :rtype: str
+ :returns: The validated, normalised email address, if it was from a
+ permitted domain. Otherwise, returns an empty string.
+ """
+ emailaddr = emailaddr.lower()
+ localpart, domain = extractEmailAddress(emailaddr)
+ canonical = canonicalizeEmailDomain(domain, domainmap)
+
+ if ignorePlus:
+ idx = localpart.find('+')
+ if idx >= 0:
+ localpart = localpart[:idx]
+
+ rules = domainrules.get(canonical, [])
+ if 'ignore_dots' in rules:
+ localpart = localpart.replace(".", "")
+
+ normalized = "%s@%s" % (localpart, domain)
+ return normalized
+
class PortList(object):
"""A container class for validated port ranges.
diff --git a/lib/bridgedb/test/test_EmailServer.py b/lib/bridgedb/test/test_EmailServer.py
index f828be7..8521762 100644
--- a/lib/bridgedb/test/test_EmailServer.py
+++ b/lib/bridgedb/test/test_EmailServer.py
@@ -20,10 +20,10 @@ from io import StringIO
import copy
from bridgedb import EmailServer
-from bridgedb.Dist import BadEmail
from bridgedb.Dist import EmailBasedDistributor
from bridgedb.EmailServer import MailContext
from bridgedb.Time import NoSchedule
+from bridgedb.parse.addr import BadEmail
from bridgedb.persistent import Conf
from bridgedb.test.util import fileCheckDecorator
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits