[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

[or-cvs] [metrics-utils/master] Fixed the issue of not converting the Apache log datetime to UTC. Also made a couple of improvements suggested by Damian Johnson. Next up: parallelizing get_exitlist to make the program less I/O-bound.



Author: Kiyoto Tamura <owenestea@xxxxxxxxx>
Date: Wed, 20 Oct 2010 16:54:10 -0500
Subject: Fixed the issue of not converting the Apache log datetime to UTC. Also made a couple of improvements suggested by Damian Johnson. Next up: parallelizing get_exitlist to make the program less I/O-bound.
Commit: 19f2ed89a34b2f6f29488c964bced7219cd0a124

---
 visitor/visitor.py |   27 +++++++++++++++------------
 1 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/visitor/visitor.py b/visitor/visitor.py
index c87222d..c15134d 100644
--- a/visitor/visitor.py
+++ b/visitor/visitor.py
@@ -7,21 +7,20 @@
 import re
 import sys
 import os
-import doctest
-from datetime import datetime, date, timedelta
+from datetime import datetime, timedelta
 import bisect
-from time import strptime # datetime.strptime does not exist for version < 2.5
+from time import strptime, mktime, gmtime # datetime.strptime does not exist for version < 2.5
 from cStringIO import StringIO
 
 # regexes used in the script
 IP_RE = re.compile(r'(\d+\.){3}\d+')
 APACHE_DATETIME = re.compile(r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}) -\d{4}\]')
-TOR_USERAGENTS = [('torbutton1_2_0rc1', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
-                                                   r'en-US; rv\:1\.8\.1\.14\) '
-                                                   r'Gecko/20080404 Firefox/2\.0\.0\.14')),
-                  ('torbutton1_2_0', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
+TOR_USERAGENTS = [('torbutton1_2_0', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
                                                 r'[a-z]{2}-[A-Z]{2}; rv\:1\.8\.1\.16\) '
                                                 r'Gecko/20080702 Firefox/2\.0\.0\.16')),
+                  ('torbutton1_2_0rc1', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
+                                                   r'en-US; rv\:1\.8\.1\.14\) '
+                                                   r'Gecko/20080404 Firefox/2\.0\.0\.14')),
                   ('torbutton1_2_1', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
                                                 r'en-US; rv\:1\.9\.0\.7\) '
                                                 r'Gecko/2009021910 Firefox/3\.0\.7')),
@@ -59,9 +58,10 @@ def get_exitlist(exitlist_filepath):
 
 def apache_time2datetime(time_str):
     """
-    transforms the apache time to a Python datetime object
+    Transforms the apache time to a Python datetime object.
     """
-    yr, mo, d, h, m, s, _, _, _ = strptime(time_str, '%d/%b/%Y:%H:%M:%S')
+    # We need to convert the time to UTC
+    yr, mo, d, h, m, s, _, _, _ = gmtime(mktime(strptime(time_str, '%d/%b/%Y:%H:%M:%S')))
     return datetime(yr, mo, d, h, m, s)
 
 def parse_apache_line(log_line):
@@ -104,7 +104,7 @@ def analyze(apache_log_path, exitlist_path, output = sys.stdout):
     try:
         apache_log_file = open(apache_log_path)
     except IOError:
-        raise IOError('Could not open %s. Please check the path to the access log again')
+        raise IOError('Could not open %s. Please check the path to the access log again'%apache_log_path)
 
     for apache_line in apache_log_file:
         ip, user_agent, apache_datetime = parse_apache_line(apache_line)
@@ -142,7 +142,7 @@ def analyze(apache_log_path, exitlist_path, output = sys.stdout):
 
     while curr_apache_date <= last_apache_date:
         stats = tor_stats.get(curr_apache_date)
-        if stats is None:
+        if stats == None:
             stats = {'date': curr_apache_date}
         buffer.write(','.join([str(stats.get(col, 'N/A')) for col in col_list]) + '\n')
         curr_apache_date += timedelta(1)
@@ -167,4 +167,7 @@ usage: python visitor.py <access_log path> <exit list path> (<output file path>)
     else:
         access_log_path = sys.argv[1]
         exitlist_path = sys.argv[2]
-        analyze(access_log_path, exitlist_path)
+        if argc > 3:
+            analyze(access_log_path, exitlist_path, sys.argv[3])
+        else:
+            analyze(access_log_path, exitlist_path)
-- 
1.7.1