[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] [metrics-utils/master] Fixed the issue of not converting the apache log datetime to UTC. Also made a couple of improvements suggested by Damian Johnson. Next up: parallelizing get_exitlist to make the program less I/O-bound.
Author: Kiyoto Tamura <owenestea@xxxxxxxxx>
Date: Wed, 20 Oct 2010 16:54:10 -0500
Subject: Fixed the issue of not converting the apache log datetime to UTC. Also made a couple of improvements suggested by Damian Johnson. Next up: parallelizing get_exitlist to make the program less I/O-bound.
Commit: 19f2ed89a34b2f6f29488c964bced7219cd0a124
---
visitor/visitor.py | 27 +++++++++++++++------------
1 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/visitor/visitor.py b/visitor/visitor.py
index c87222d..c15134d 100644
--- a/visitor/visitor.py
+++ b/visitor/visitor.py
@@ -7,21 +7,20 @@
import re
import sys
import os
-import doctest
-from datetime import datetime, date, timedelta
+from datetime import datetime, timedelta
import bisect
-from time import strptime # datetime.strptime does not exist for version < 2.5
+from time import strptime, mktime, gmtime # datetime.strptime does not exist for version < 2.5
from cStringIO import StringIO
# regexes used in the script
IP_RE = re.compile(r'(\d+\.){3}\d+')
APACHE_DATETIME = re.compile(r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}) -\d{4}\]')
-TOR_USERAGENTS = [('torbutton1_2_0rc1', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
- r'en-US; rv\:1\.8\.1\.14\) '
- r'Gecko/20080404 Firefox/2\.0\.0\.14')),
- ('torbutton1_2_0', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
+TOR_USERAGENTS = [('torbutton1_2_0', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
r'[a-z]{2}-[A-Z]{2}; rv\:1\.8\.1\.16\) '
r'Gecko/20080702 Firefox/2\.0\.0\.16')),
+ ('torbutton1_2_0rc1', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
+ r'en-US; rv\:1\.8\.1\.14\) '
+ r'Gecko/20080404 Firefox/2\.0\.0\.14')),
('torbutton1_2_1', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
r'en-US; rv\:1\.9\.0\.7\) '
r'Gecko/2009021910 Firefox/3\.0\.7')),
@@ -59,9 +58,10 @@ def get_exitlist(exitlist_filepath):
def apache_time2datetime(time_str):
"""
- transforms the apache time to a Python datetime object
+ Transforms the apache time to a Python datetime object.
"""
- yr, mo, d, h, m, s, _, _, _ = strptime(time_str, '%d/%b/%Y:%H:%M:%S')
+ # We need to convert the time to UTC
+ yr, mo, d, h, m, s, _, _, _ = gmtime(mktime(strptime(time_str, '%d/%b/%Y:%H:%M:%S')))
return datetime(yr, mo, d, h, m, s)
def parse_apache_line(log_line):
@@ -104,7 +104,7 @@ def analyze(apache_log_path, exitlist_path, output = sys.stdout):
try:
apache_log_file = open(apache_log_path)
except IOError:
- raise IOError('Could not open %s. Please check the path to the access log again')
+ raise IOError('Could not open %s. Please check the path to the access log again'%apache_log_path)
for apache_line in apache_log_file:
ip, user_agent, apache_datetime = parse_apache_line(apache_line)
@@ -142,7 +142,7 @@ def analyze(apache_log_path, exitlist_path, output = sys.stdout):
while curr_apache_date <= last_apache_date:
stats = tor_stats.get(curr_apache_date)
- if stats is None:
+ if stats == None:
stats = {'date': curr_apache_date}
buffer.write(','.join([str(stats.get(col, 'N/A')) for col in col_list]) + '\n')
curr_apache_date += timedelta(1)
@@ -167,4 +167,7 @@ usage: python visitor.py <access_log path> <exit list path> (<output file path>)
else:
access_log_path = sys.argv[1]
exitlist_path = sys.argv[2]
- analyze(access_log_path, exitlist_path)
+ if argc > 3:
+ analyze(access_log_path, exitlist_path, sys.argv[3])
+ else:
+ analyze(access_log_path, exitlist_path)
--
1.7.1