[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] [metrics-utils/master] Add visitor.py written by Kiyoto Tamura.
Author: Karsten Loesing <karsten.loesing@xxxxxxx>
Date: Wed, 20 Oct 2010 11:33:36 +0200
Subject: Add visitor.py written by Kiyoto Tamura.
Commit: 3884f121e6cfeb623acad27f2c5f30f81fa7a299
---
visitor/visitor.py | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 170 insertions(+), 0 deletions(-)
create mode 100644 visitor/visitor.py
diff --git a/visitor/visitor.py b/visitor/visitor.py
new file mode 100644
index 0000000..c87222d
--- /dev/null
+++ b/visitor/visitor.py
@@ -0,0 +1,170 @@
+# author: Kiyoto Tamura <owenestea@xxxxxxxxx>
+#
+# A Python port of Karsten Loesing's VisiTor.
+#
+#
+
+import re
+import sys
+import os
+import doctest
+from datetime import datetime, date, timedelta
+import bisect
+from time import strptime # datetime.strptime does not exist for version < 2.5
+from cStringIO import StringIO
+
# Regexes used in the script.

# Dotted-quad IPv4 address. Used with .match(), so it only hits at the very
# start of a log line; octet values are not range-checked.
IP_RE = re.compile(r'(\d+\.){3}\d+')

# Bracketed Apache timestamp, e.g. [20/Oct/2010:11:33:36 +0200].
# Group 1 is the date/time without the UTC offset. The offset sign may be
# '+' or '-'; accepting only '-' made every log written east of UTC
# unparseable. The offset itself is matched but not captured.
APACHE_DATETIME = re.compile(r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}) [+-]\d{4}\]')
# (label, compiled pattern) pairs. The labels become output column names and
# the list order is the order in which the patterns are tried.
# NOTE(review): the labels look like Torbutton release names, so each pattern
# is presumably the browser user agent that release presents -- confirm.
TOR_USERAGENTS = [
    (
        'torbutton1_2_0rc1',
        re.compile(
            r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
            r'en-US; rv\:1\.8\.1\.14\) '
            r'Gecko/20080404 Firefox/2\.0\.0\.14'
        ),
    ),
    (
        'torbutton1_2_0',
        re.compile(
            r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
            r'[a-z]{2}-[A-Z]{2}; rv\:1\.8\.1\.16\) '
            r'Gecko/20080702 Firefox/2\.0\.0\.16'
        ),
    ),
    (
        'torbutton1_2_1',
        re.compile(
            r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
            r'en-US; rv\:1\.9\.0\.7\) '
            r'Gecko/2009021910 Firefox/3\.0\.7'
        ),
    ),
    (
        'torbutton1_2_5',
        re.compile(
            r'Mozilla/5\.0 \(Windows; U; Windows NT 6\.1; '
            r'[a-z]{2}-[A-Z]{2}; rv:1\.9\.2\.3\) '
            r'Gecko/20100401 Firefox/3\.6\.3'
        ),
    ),
]
+
+
class ApacheParseError(Exception):
    """
    Raised by parse_apache_line when a log line does not start with an IP
    address or contains no recognizable bracketed timestamp.
    """
class NoExitListAvailableError(Exception):
    """
    Error type for a missing exit list.

    NOTE(review): nothing in this file raises it; presumably reserved for
    the case where no exit list can be found -- confirm before relying on it.
    """
+
def get_exitlist(exitlist_filepath):
    """
    Builds a lookup table of exit addresses from a directory of exit lists.

    Walks exitlist_filepath recursively and reads every file found. Only
    lines starting with 'ExitAddress' are used; they are expected to look
    like

        ExitAddress <ip> <YYYY-MM-DD HH:MM:SS>

    Returns a dictionary keyed by ip address. Each value is a list of
    datetime objects in ascending order, one per ExitAddress line seen for
    that address (duplicates are kept).

    A file that cannot be opened or read is reported on stderr and skipped.
    An ExitAddress line with missing fields or an unparseable timestamp
    raises ValueError.
    """
    exitlist = {}
    for dirpath, _, filenames in os.walk(exitlist_filepath, topdown=False):
        for filename in filenames:
            fn = os.path.join(dirpath, filename)
            try:
                f = open(fn)
                # Nested try/finally (not try/except/finally) so the file is
                # always closed and the code still runs on Python < 2.5.
                try:
                    for line in f:
                        if not line.startswith('ExitAddress'):
                            continue
                        _keyword, ip, dt = line.split(' ', 2)
                        # strip() drops the newline plus any stray spaces or
                        # '\r'. The former rstrip('\s\n') removed the literal
                        # characters backslash, 's' and newline instead, so
                        # trailing whitespace made strptime fail.
                        fields = strptime(dt.strip(), '%Y-%m-%d %H:%M:%S')
                        # insort keeps each per-address list ordered.
                        bisect.insort(exitlist.setdefault(ip, []),
                                      datetime(*fields[:6]))
                finally:
                    f.close()
            except IOError:
                sys.stderr.write('could not open %s. Skipping it.\n' % fn)

    return exitlist
+
def apache_time2datetime(time_str):
    """
    Converts an Apache log timestamp such as '20/Oct/2010:11:33:36' (no
    brackets, no UTC offset) into a naive Python datetime object.

    Raises ValueError (from strptime) if time_str does not have that form.
    """
    fields = strptime(time_str, '%d/%b/%Y:%H:%M:%S')
    # The first six struct_time entries are year, month, day, hour, minute
    # and second -- exactly datetime's leading positional arguments.
    return datetime(*fields[:6])
+
def parse_apache_line(log_line):
    """
    Parses one line of an Apache access log, assumed to be in the "combined"
    format.

    Returns the tuple (ip, user_agent, apache_datetime):
      ip              -- the dotted address matched at the start of the line
      user_agent      -- whatever follows the last '" ' in the line, trailing
                         newlines removed (for a combined-format line this
                         should be the User-Agent field, quotes included)
      apache_datetime -- naive datetime built from the bracketed timestamp;
                         the UTC offset is ignored

    Raises ApacheParseError if the line does not start with an IP address
    or has no timestamp that APACHE_DATETIME recognizes.
    """
    # match() anchors at position 0: the address has to lead the line.
    ip_match = IP_RE.match(log_line)
    if ip_match is None:
        raise ApacheParseError("Could not match the IP address at the beginning of the line for %s"%log_line)

    time_match = APACHE_DATETIME.search(log_line)
    if time_match is None:
        raise ApacheParseError("Could not match the datetime for the line %s"%log_line)
    when = apache_time2datetime(time_match.group(1))

    pieces = log_line.split('" ')
    agent = pieces[-1].rstrip('\n')

    # A plain tuple for now; a dict keyed by field name is the alternative
    # if positional unpacking gets confusing for callers.
    return ip_match.group(0), agent, when
+
+
def is_tor(apache_ip, apache_time, exitlist):
    """
    Tells whether a request from apache_ip at apache_time counts as Tor.

    exitlist maps an ip address to an ascending list of datetimes. The
    request counts as Tor when the address is in exitlist and the earliest
    timestamp at or after apache_time lies at most one day after it.
    Timestamps before apache_time are never considered.
    """
    if apache_ip in exitlist:
        seen = exitlist[apache_ip]
        # Index of the first recorded timestamp >= apache_time.
        idx = bisect.bisect_left(seen, apache_time)
        if idx < len(seen):
            return seen[idx] - apache_time <= timedelta(days=1)
    return False
+
def analyze(apache_log_path, exitlist_path, output = sys.stdout):
    """
    The main routine. It reads the exit list, goes through the Apache access
    log line by line and checks whether each request is a Tor request.
    TODO: filter out the bots.

    apache_log_path -- path of the Apache access log
    exitlist_path   -- directory handed to get_exitlist
    output          -- sys.stdout (the default) to print the report, any
                       other value is treated as the path of a file to write

    The report is CSV with the columns date, tor, nottor and one column per
    TOR_USERAGENTS label, one row per calendar day from the first to the
    last date seen in the log. Days without any request get 'N/A' in every
    count column. A Tor request is counted under the first user agent
    pattern that matches, if any.

    Raises IOError if the access log cannot be opened or the output file
    cannot be written. Errors raised by parse_apache_line for a malformed
    line are not caught, so one bad line aborts the analysis.
    """
    exitlist = get_exitlist(exitlist_path)

    tor_ua = TOR_USERAGENTS
    col_list = ['date', 'tor', 'nottor'] + [tor_type for tor_type, _ in tor_ua]

    try:
        apache_log_file = open(apache_log_path)
    except IOError:
        # The path is now actually interpolated into the message.
        raise IOError('Could not open %s. Please check the path to the access log again'
                      % apache_log_path)

    # Per-day counters, keyed by date.
    tor_stats = {}
    try:
        for apache_line in apache_log_file:
            ip, user_agent, apache_datetime = parse_apache_line(apache_line)
            apache_date = apache_datetime.date()
            stats = tor_stats.get(apache_date)
            if stats is None:
                stats = {'date': apache_date, 'tor': 0, 'nottor': 0}
                for tor_type, _ in tor_ua:
                    stats[tor_type] = 0
                tor_stats[apache_date] = stats

            if is_tor(ip, apache_datetime, exitlist):
                stats['tor'] += 1
                for tor_type, tor_re in tor_ua:
                    if tor_re.search(user_agent):
                        stats[tor_type] += 1
                        break
            else:
                stats['nottor'] += 1
    finally:
        apache_log_file.close()

    if not tor_stats:
        sys.stderr.write('No data to be written. Exiting\n')
        return

    # Build the whole report in memory before touching the output.
    rows = [','.join(col_list)]
    apache_dates = sorted(tor_stats)
    curr_apache_date = apache_dates[0]
    last_apache_date = apache_dates[-1]
    one_day = timedelta(1)
    while curr_apache_date <= last_apache_date:
        stats = tor_stats.get(curr_apache_date)
        if stats is None:
            # Gap day: only the date is known, the counts print as N/A.
            stats = {'date': curr_apache_date}
        rows.append(','.join([str(stats.get(col, 'N/A')) for col in col_list]))
        curr_apache_date += one_day
    report = '\n'.join(rows) + '\n'

    if output != sys.stdout:
        try:
            ofile = open(output, 'w')
            try:
                ofile.write(report)
            finally:
                ofile.close()
        except (IOError, OSError):
            raise IOError("Could not write results to %s. Exiting witout writing"%output)
    else:
        # The print statement used here before appended one more newline
        # after the report; keep the stdout output byte-identical.
        sys.stdout.write(report + '\n')
+
# Command-line entry point:
#   python visitor.py <access_log path> <exit list path> (<output file path>)
if __name__ == '__main__':
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print("""
usage: python visitor.py <access_log path> <exit list path> (<output file path>)
    """)
    else:
        access_log_path = sys.argv[1]
        exitlist_path = sys.argv[2]
        if argc == 4:
            # The optional third argument was accepted but silently ignored;
            # hand it to analyze so the report goes to that file.
            analyze(access_log_path, exitlist_path, sys.argv[3])
        else:
            analyze(access_log_path, exitlist_path)
--
1.7.1